//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
93 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; 94 95 // A lowering phase that runs before any selection attempts. 96 // Returns true if the instruction was modified. 97 bool preISelLower(MachineInstr &I); 98 99 // An early selection function that runs before the selectImpl() call. 100 bool earlySelect(MachineInstr &I); 101 102 // Do some preprocessing of G_PHIs before we begin selection. 103 void processPHIs(MachineFunction &MF); 104 105 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI); 106 107 /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 108 bool contractCrossBankCopyIntoStore(MachineInstr &I, 109 MachineRegisterInfo &MRI); 110 111 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); 112 113 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, 114 MachineRegisterInfo &MRI) const; 115 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, 116 MachineRegisterInfo &MRI) const; 117 118 ///@{ 119 /// Helper functions for selectCompareBranch. 120 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, 121 MachineIRBuilder &MIB) const; 122 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 123 MachineIRBuilder &MIB) const; 124 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 125 MachineIRBuilder &MIB) const; 126 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, 127 MachineBasicBlock *DstMBB, 128 MachineIRBuilder &MIB) const; 129 ///@} 130 131 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, 132 MachineRegisterInfo &MRI); 133 134 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI); 135 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI); 136 137 // Helper to generate an equivalent of scalar_to_vector into a new register, 138 // returned via 'Dst'. 139 MachineInstr *emitScalarToVector(unsigned EltSize, 140 const TargetRegisterClass *DstRC, 141 Register Scalar, 142 MachineIRBuilder &MIRBuilder) const; 143 144 /// Emit a lane insert into \p DstReg, or a new vector register if None is 145 /// provided. 146 /// 147 /// The lane inserted into is defined by \p LaneIdx. The vector source 148 /// register is given by \p SrcReg. The register containing the element is 149 /// given by \p EltReg. 150 MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg, 151 Register EltReg, unsigned LaneIdx, 152 const RegisterBank &RB, 153 MachineIRBuilder &MIRBuilder) const; 154 155 /// Emit a sequence of instructions representing a constant \p CV for a 156 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.) 157 /// 158 /// \returns the last instruction in the sequence on success, and nullptr 159 /// otherwise. 160 MachineInstr *emitConstantVector(Register Dst, Constant *CV, 161 MachineIRBuilder &MIRBuilder, 162 MachineRegisterInfo &MRI); 163 164 bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI); 165 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, 166 MachineRegisterInfo &MRI); 167 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a 168 /// SUBREG_TO_REG. 
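  ///
  /// Illustrative sketch only (register names and banks are placeholders; the
  /// precise legality checks live in the implementation): a build vector whose
  /// lanes other than lane 0 are all undef can become a plain subregister
  /// insertion, e.g.
  /// \code
  ///   %undef:_(s32) = G_IMPLICIT_DEF
  ///   %vec:_(<2 x s32>) = G_BUILD_VECTOR %elt(s32), %undef(s32)
  ///   ; roughly selects to: %vec = SUBREG_TO_REG 0, %elt, ssub
  /// \endcode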
169 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI); 170 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI); 171 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 172 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 173 174 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI); 175 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI); 176 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI); 177 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI); 178 179 /// Helper function to select vector load intrinsics like 180 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc. 181 /// \p Opc is the opcode that the selected instruction should use. 182 /// \p NumVecs is the number of vector destinations for the instruction. 183 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction. 184 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs, 185 MachineInstr &I); 186 bool selectIntrinsicWithSideEffects(MachineInstr &I, 187 MachineRegisterInfo &MRI); 188 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); 189 bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI); 190 bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; 191 bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; 192 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI); 193 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI); 194 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI); 195 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); 196 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); 197 198 unsigned emitConstantPoolEntry(const Constant *CPVal, 199 MachineFunction &MF) const; 200 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, 201 MachineIRBuilder &MIRBuilder) const; 202 203 // Emit a vector concat operation. 204 MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1, 205 Register Op2, 206 MachineIRBuilder &MIRBuilder) const; 207 208 // Emit an integer compare between LHS and RHS, which checks for Predicate. 209 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 210 MachineOperand &Predicate, 211 MachineIRBuilder &MIRBuilder) const; 212 213 /// Emit a floating point comparison between \p LHS and \p RHS. 214 /// \p Pred if given is the intended predicate to use. 215 MachineInstr *emitFPCompare(Register LHS, Register RHS, 216 MachineIRBuilder &MIRBuilder, 217 Optional<CmpInst::Predicate> = None) const; 218 219 MachineInstr *emitInstr(unsigned Opcode, 220 std::initializer_list<llvm::DstOp> DstOps, 221 std::initializer_list<llvm::SrcOp> SrcOps, 222 MachineIRBuilder &MIRBuilder, 223 const ComplexRendererFns &RenderFns = None) const; 224 /// Helper function to emit an add or sub instruction. 225 /// 226 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above 227 /// in a specific order. 228 /// 229 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. 
230 /// 231 /// \code 232 /// const std::array<std::array<unsigned, 2>, 4> Table { 233 /// {{AArch64::ADDXri, AArch64::ADDWri}, 234 /// {AArch64::ADDXrs, AArch64::ADDWrs}, 235 /// {AArch64::ADDXrr, AArch64::ADDWrr}, 236 /// {AArch64::SUBXri, AArch64::SUBWri}, 237 /// {AArch64::ADDXrx, AArch64::ADDWrx}}}; 238 /// \endcode 239 /// 240 /// Each row in the table corresponds to a different addressing mode. Each 241 /// column corresponds to a different register size. 242 /// 243 /// \attention Rows must be structured as follows: 244 /// - Row 0: The ri opcode variants 245 /// - Row 1: The rs opcode variants 246 /// - Row 2: The rr opcode variants 247 /// - Row 3: The ri opcode variants for negative immediates 248 /// - Row 4: The rx opcode variants 249 /// 250 /// \attention Columns must be structured as follows: 251 /// - Column 0: The 64-bit opcode variants 252 /// - Column 1: The 32-bit opcode variants 253 /// 254 /// \p Dst is the destination register of the binop to emit. 255 /// \p LHS is the left-hand operand of the binop to emit. 256 /// \p RHS is the right-hand operand of the binop to emit. 257 MachineInstr *emitAddSub( 258 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 259 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 260 MachineIRBuilder &MIRBuilder) const; 261 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, 262 MachineOperand &RHS, 263 MachineIRBuilder &MIRBuilder) const; 264 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 265 MachineIRBuilder &MIRBuilder) const; 266 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 267 MachineIRBuilder &MIRBuilder) const; 268 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, 269 MachineIRBuilder &MIRBuilder) const; 270 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, 271 MachineIRBuilder &MIRBuilder) const; 272 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS, 273 AArch64CC::CondCode CC, 274 MachineIRBuilder &MIRBuilder) const; 275 MachineInstr *emitExtractVectorElt(Optional<Register> DstReg, 276 const RegisterBank &DstRB, LLT ScalarTy, 277 Register VecReg, unsigned LaneIdx, 278 MachineIRBuilder &MIRBuilder) const; 279 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2, 280 AArch64CC::CondCode Pred, 281 MachineIRBuilder &MIRBuilder) const; 282 /// Emit a CSet for a FP compare. 283 /// 284 /// \p Dst is expected to be a 32-bit scalar register. 285 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, 286 MachineIRBuilder &MIRBuilder) const; 287 288 /// Emit the overflow op for \p Opcode. 289 /// 290 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO, 291 /// G_USUBO, etc. 292 std::pair<MachineInstr *, AArch64CC::CondCode> 293 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, 294 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; 295 296 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. 297 /// \p IsNegative is true if the test should be "not zero". 298 /// This will also optimize the test bit instruction when possible. 299 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, 300 MachineBasicBlock *DstMBB, 301 MachineIRBuilder &MIB) const; 302 303 /// Emit a CB(N)Z instruction which branches to \p DestMBB. 
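  ///
  /// Illustrative result (register and block names are placeholders):
  /// \code
  ///   CBZ  %compare, %destbb   ; when IsNegative is false
  ///   CBNZ %compare, %destbb   ; when IsNegative is true
  /// \endcode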
304 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, 305 MachineBasicBlock *DestMBB, 306 MachineIRBuilder &MIB) const; 307 308 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. 309 // We use these manually instead of using the importer since it doesn't 310 // support SDNodeXForm. 311 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; 312 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; 313 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; 314 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; 315 316 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; 317 ComplexRendererFns selectArithImmed(MachineOperand &Root) const; 318 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; 319 320 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, 321 unsigned Size) const; 322 323 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { 324 return selectAddrModeUnscaled(Root, 1); 325 } 326 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { 327 return selectAddrModeUnscaled(Root, 2); 328 } 329 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { 330 return selectAddrModeUnscaled(Root, 4); 331 } 332 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { 333 return selectAddrModeUnscaled(Root, 8); 334 } 335 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { 336 return selectAddrModeUnscaled(Root, 16); 337 } 338 339 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used 340 /// from complex pattern matchers like selectAddrModeIndexed(). 341 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, 342 MachineRegisterInfo &MRI) const; 343 344 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, 345 unsigned Size) const; 346 template <int Width> 347 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { 348 return selectAddrModeIndexed(Root, Width / 8); 349 } 350 351 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, 352 const MachineRegisterInfo &MRI) const; 353 ComplexRendererFns 354 selectAddrModeShiftedExtendXReg(MachineOperand &Root, 355 unsigned SizeInBytes) const; 356 357 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether 358 /// or not a shift + extend should be folded into an addressing mode. Returns 359 /// None when this is not profitable or possible. 
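  ///
  /// For example, this is what allows an address computation behind a pattern
  /// such as (illustrative assembly)
  /// \code
  ///   ldr x0, [x1, x2, lsl #3]
  /// \endcode
  /// to fold the shifted/extended register into the load's addressing mode
  /// instead of materializing the shift as a separate instruction.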
360 ComplexRendererFns 361 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, 362 MachineOperand &Offset, unsigned SizeInBytes, 363 bool WantsExt) const; 364 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; 365 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, 366 unsigned SizeInBytes) const; 367 template <int Width> 368 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { 369 return selectAddrModeXRO(Root, Width / 8); 370 } 371 372 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, 373 unsigned SizeInBytes) const; 374 template <int Width> 375 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { 376 return selectAddrModeWRO(Root, Width / 8); 377 } 378 379 ComplexRendererFns selectShiftedRegister(MachineOperand &Root, 380 bool AllowROR = false) const; 381 382 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { 383 return selectShiftedRegister(Root); 384 } 385 386 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { 387 return selectShiftedRegister(Root, true); 388 } 389 390 /// Given an extend instruction, determine the correct shift-extend type for 391 /// that instruction. 392 /// 393 /// If the instruction is going to be used in a load or store, pass 394 /// \p IsLoadStore = true. 395 AArch64_AM::ShiftExtendType 396 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, 397 bool IsLoadStore = false) const; 398 399 /// Move \p Reg to \p RC if \p Reg is not already on \p RC. 400 /// 401 /// \returns Either \p Reg if no change was necessary, or the new register 402 /// created by moving \p Reg. 403 /// 404 /// Note: This uses emitCopy right now. 405 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, 406 MachineIRBuilder &MIB) const; 407 408 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; 409 410 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, 411 int OpIdx = -1) const; 412 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, 413 int OpIdx = -1) const; 414 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, 415 int OpIdx = -1) const; 416 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, 417 int OpIdx = -1) const; 418 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, 419 int OpIdx = -1) const; 420 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, 421 int OpIdx = -1) const; 422 423 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. 424 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); 425 426 // Optimization methods. 427 bool tryOptSelect(MachineInstr &MI); 428 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 429 MachineOperand &Predicate, 430 MachineIRBuilder &MIRBuilder) const; 431 432 /// Return true if \p MI is a load or store of \p NumBytes bytes. 433 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; 434 435 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit 436 /// register zeroed out. In other words, the result of MI has been explicitly 437 /// zero extended. 
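  ///
  /// For example, a 32-bit G_ADD selected to ADDWrr qualifies, because writing
  /// a W register architecturally zeroes bits [63:32] of the corresponding X
  /// register; a value merely copied or truncated from a wider register does
  /// not.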
438 bool isDef32(const MachineInstr &MI) const; 439 440 const AArch64TargetMachine &TM; 441 const AArch64Subtarget &STI; 442 const AArch64InstrInfo &TII; 443 const AArch64RegisterInfo &TRI; 444 const AArch64RegisterBankInfo &RBI; 445 446 bool ProduceNonFlagSettingCondBr = false; 447 448 // Some cached values used during selection. 449 // We use LR as a live-in register, and we keep track of it here as it can be 450 // clobbered by calls. 451 Register MFReturnAddr; 452 453 MachineIRBuilder MIB; 454 455 #define GET_GLOBALISEL_PREDICATES_DECL 456 #include "AArch64GenGlobalISel.inc" 457 #undef GET_GLOBALISEL_PREDICATES_DECL 458 459 // We declare the temporaries used by selectImpl() in the class to minimize the 460 // cost of constructing placeholder values. 461 #define GET_GLOBALISEL_TEMPORARIES_DECL 462 #include "AArch64GenGlobalISel.inc" 463 #undef GET_GLOBALISEL_TEMPORARIES_DECL 464 }; 465 466 } // end anonymous namespace 467 468 #define GET_GLOBALISEL_IMPL 469 #include "AArch64GenGlobalISel.inc" 470 #undef GET_GLOBALISEL_IMPL 471 472 AArch64InstructionSelector::AArch64InstructionSelector( 473 const AArch64TargetMachine &TM, const AArch64Subtarget &STI, 474 const AArch64RegisterBankInfo &RBI) 475 : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), 476 TRI(*STI.getRegisterInfo()), RBI(RBI), 477 #define GET_GLOBALISEL_PREDICATES_INIT 478 #include "AArch64GenGlobalISel.inc" 479 #undef GET_GLOBALISEL_PREDICATES_INIT 480 #define GET_GLOBALISEL_TEMPORARIES_INIT 481 #include "AArch64GenGlobalISel.inc" 482 #undef GET_GLOBALISEL_TEMPORARIES_INIT 483 { 484 } 485 486 // FIXME: This should be target-independent, inferred from the types declared 487 // for each class in the bank. 488 static const TargetRegisterClass * 489 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, 490 const RegisterBankInfo &RBI, 491 bool GetAllRegSet = false) { 492 if (RB.getID() == AArch64::GPRRegBankID) { 493 if (Ty.getSizeInBits() <= 32) 494 return GetAllRegSet ? &AArch64::GPR32allRegClass 495 : &AArch64::GPR32RegClass; 496 if (Ty.getSizeInBits() == 64) 497 return GetAllRegSet ? &AArch64::GPR64allRegClass 498 : &AArch64::GPR64RegClass; 499 if (Ty.getSizeInBits() == 128) 500 return &AArch64::XSeqPairsClassRegClass; 501 return nullptr; 502 } 503 504 if (RB.getID() == AArch64::FPRRegBankID) { 505 switch (Ty.getSizeInBits()) { 506 case 8: 507 return &AArch64::FPR8RegClass; 508 case 16: 509 return &AArch64::FPR16RegClass; 510 case 32: 511 return &AArch64::FPR32RegClass; 512 case 64: 513 return &AArch64::FPR64RegClass; 514 case 128: 515 return &AArch64::FPR128RegClass; 516 } 517 return nullptr; 518 } 519 520 return nullptr; 521 } 522 523 /// Given a register bank, and size in bits, return the smallest register class 524 /// that can represent that combination. 525 static const TargetRegisterClass * 526 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, 527 bool GetAllRegSet = false) { 528 unsigned RegBankID = RB.getID(); 529 530 if (RegBankID == AArch64::GPRRegBankID) { 531 if (SizeInBits <= 32) 532 return GetAllRegSet ? &AArch64::GPR32allRegClass 533 : &AArch64::GPR32RegClass; 534 if (SizeInBits == 64) 535 return GetAllRegSet ? 
&AArch64::GPR64allRegClass 536 : &AArch64::GPR64RegClass; 537 if (SizeInBits == 128) 538 return &AArch64::XSeqPairsClassRegClass; 539 } 540 541 if (RegBankID == AArch64::FPRRegBankID) { 542 switch (SizeInBits) { 543 default: 544 return nullptr; 545 case 8: 546 return &AArch64::FPR8RegClass; 547 case 16: 548 return &AArch64::FPR16RegClass; 549 case 32: 550 return &AArch64::FPR32RegClass; 551 case 64: 552 return &AArch64::FPR64RegClass; 553 case 128: 554 return &AArch64::FPR128RegClass; 555 } 556 } 557 558 return nullptr; 559 } 560 561 /// Returns the correct subregister to use for a given register class. 562 static bool getSubRegForClass(const TargetRegisterClass *RC, 563 const TargetRegisterInfo &TRI, unsigned &SubReg) { 564 switch (TRI.getRegSizeInBits(*RC)) { 565 case 8: 566 SubReg = AArch64::bsub; 567 break; 568 case 16: 569 SubReg = AArch64::hsub; 570 break; 571 case 32: 572 if (RC != &AArch64::FPR32RegClass) 573 SubReg = AArch64::sub_32; 574 else 575 SubReg = AArch64::ssub; 576 break; 577 case 64: 578 SubReg = AArch64::dsub; 579 break; 580 default: 581 LLVM_DEBUG( 582 dbgs() << "Couldn't find appropriate subregister for register class."); 583 return false; 584 } 585 586 return true; 587 } 588 589 /// Returns the minimum size the given register bank can hold. 590 static unsigned getMinSizeForRegBank(const RegisterBank &RB) { 591 switch (RB.getID()) { 592 case AArch64::GPRRegBankID: 593 return 32; 594 case AArch64::FPRRegBankID: 595 return 8; 596 default: 597 llvm_unreachable("Tried to get minimum size for unknown register bank."); 598 } 599 } 600 601 /// Create a REG_SEQUENCE instruction using the registers in \p Regs. 602 /// Helper function for functions like createDTuple and createQTuple. 603 /// 604 /// \p RegClassIDs - The list of register class IDs available for some tuple of 605 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is 606 /// expected to contain between 2 and 4 tuple classes. 607 /// 608 /// \p SubRegs - The list of subregister classes associated with each register 609 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0 610 /// subregister class. The index of each subregister class is expected to 611 /// correspond with the index of each register class. 612 /// 613 /// \returns Either the destination register of REG_SEQUENCE instruction that 614 /// was created, or the 0th element of \p Regs if \p Regs contains a single 615 /// element. 616 static Register createTuple(ArrayRef<Register> Regs, 617 const unsigned RegClassIDs[], 618 const unsigned SubRegs[], MachineIRBuilder &MIB) { 619 unsigned NumRegs = Regs.size(); 620 if (NumRegs == 1) 621 return Regs[0]; 622 assert(NumRegs >= 2 && NumRegs <= 4 && 623 "Only support between two and 4 registers in a tuple!"); 624 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo(); 625 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]); 626 auto RegSequence = 627 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {}); 628 for (unsigned I = 0, E = Regs.size(); I < E; ++I) { 629 RegSequence.addUse(Regs[I]); 630 RegSequence.addImm(SubRegs[I]); 631 } 632 return RegSequence.getReg(0); 633 } 634 635 /// Create a tuple of D-registers using the registers in \p Regs. 
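/// E.g., two D-registers become a DD tuple (a register pair) of the kind
/// consumed by the multi-vector NEON load/store instructions.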
636 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 637 static const unsigned RegClassIDs[] = { 638 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; 639 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, 640 AArch64::dsub2, AArch64::dsub3}; 641 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 642 } 643 644 /// Create a tuple of Q-registers using the registers in \p Regs. 645 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 646 static const unsigned RegClassIDs[] = { 647 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; 648 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, 649 AArch64::qsub2, AArch64::qsub3}; 650 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 651 } 652 653 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { 654 auto &MI = *Root.getParent(); 655 auto &MBB = *MI.getParent(); 656 auto &MF = *MBB.getParent(); 657 auto &MRI = MF.getRegInfo(); 658 uint64_t Immed; 659 if (Root.isImm()) 660 Immed = Root.getImm(); 661 else if (Root.isCImm()) 662 Immed = Root.getCImm()->getZExtValue(); 663 else if (Root.isReg()) { 664 auto ValAndVReg = 665 getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true); 666 if (!ValAndVReg) 667 return None; 668 Immed = ValAndVReg->Value.getSExtValue(); 669 } else 670 return None; 671 return Immed; 672 } 673 674 /// Check whether \p I is a currently unsupported binary operation: 675 /// - it has an unsized type 676 /// - an operand is not a vreg 677 /// - all operands are not in the same bank 678 /// These are checks that should someday live in the verifier, but right now, 679 /// these are mostly limitations of the aarch64 selector. 680 static bool unsupportedBinOp(const MachineInstr &I, 681 const AArch64RegisterBankInfo &RBI, 682 const MachineRegisterInfo &MRI, 683 const AArch64RegisterInfo &TRI) { 684 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 685 if (!Ty.isValid()) { 686 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); 687 return true; 688 } 689 690 const RegisterBank *PrevOpBank = nullptr; 691 for (auto &MO : I.operands()) { 692 // FIXME: Support non-register operands. 693 if (!MO.isReg()) { 694 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); 695 return true; 696 } 697 698 // FIXME: Can generic operations have physical registers operands? If 699 // so, this will need to be taught about that, and we'll need to get the 700 // bank out of the minimal class for the register. 701 // Either way, this needs to be documented (and possibly verified). 702 if (!Register::isVirtualRegister(MO.getReg())) { 703 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); 704 return true; 705 } 706 707 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); 708 if (!OpBank) { 709 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); 710 return true; 711 } 712 713 if (PrevOpBank && OpBank != PrevOpBank) { 714 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); 715 return true; 716 } 717 PrevOpBank = OpBank; 718 } 719 return false; 720 } 721 722 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc 723 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID 724 /// and of size \p OpSize. 725 /// \returns \p GenericOpc if the combination is unsupported. 
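/// For example (illustrative call), selectBinaryOp(TargetOpcode::G_SHL,
/// AArch64::GPRRegBankID, 32) yields AArch64::LSLVWr.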
726 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, 727 unsigned OpSize) { 728 switch (RegBankID) { 729 case AArch64::GPRRegBankID: 730 if (OpSize == 32) { 731 switch (GenericOpc) { 732 case TargetOpcode::G_SHL: 733 return AArch64::LSLVWr; 734 case TargetOpcode::G_LSHR: 735 return AArch64::LSRVWr; 736 case TargetOpcode::G_ASHR: 737 return AArch64::ASRVWr; 738 default: 739 return GenericOpc; 740 } 741 } else if (OpSize == 64) { 742 switch (GenericOpc) { 743 case TargetOpcode::G_PTR_ADD: 744 return AArch64::ADDXrr; 745 case TargetOpcode::G_SHL: 746 return AArch64::LSLVXr; 747 case TargetOpcode::G_LSHR: 748 return AArch64::LSRVXr; 749 case TargetOpcode::G_ASHR: 750 return AArch64::ASRVXr; 751 default: 752 return GenericOpc; 753 } 754 } 755 break; 756 case AArch64::FPRRegBankID: 757 switch (OpSize) { 758 case 32: 759 switch (GenericOpc) { 760 case TargetOpcode::G_FADD: 761 return AArch64::FADDSrr; 762 case TargetOpcode::G_FSUB: 763 return AArch64::FSUBSrr; 764 case TargetOpcode::G_FMUL: 765 return AArch64::FMULSrr; 766 case TargetOpcode::G_FDIV: 767 return AArch64::FDIVSrr; 768 default: 769 return GenericOpc; 770 } 771 case 64: 772 switch (GenericOpc) { 773 case TargetOpcode::G_FADD: 774 return AArch64::FADDDrr; 775 case TargetOpcode::G_FSUB: 776 return AArch64::FSUBDrr; 777 case TargetOpcode::G_FMUL: 778 return AArch64::FMULDrr; 779 case TargetOpcode::G_FDIV: 780 return AArch64::FDIVDrr; 781 case TargetOpcode::G_OR: 782 return AArch64::ORRv8i8; 783 default: 784 return GenericOpc; 785 } 786 } 787 break; 788 } 789 return GenericOpc; 790 } 791 792 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, 793 /// appropriate for the (value) register bank \p RegBankID and of memory access 794 /// size \p OpSize. This returns the variant with the base+unsigned-immediate 795 /// addressing mode (e.g., LDRXui). 796 /// \returns \p GenericOpc if the combination is unsupported. 797 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, 798 unsigned OpSize) { 799 const bool isStore = GenericOpc == TargetOpcode::G_STORE; 800 switch (RegBankID) { 801 case AArch64::GPRRegBankID: 802 switch (OpSize) { 803 case 8: 804 return isStore ? AArch64::STRBBui : AArch64::LDRBBui; 805 case 16: 806 return isStore ? AArch64::STRHHui : AArch64::LDRHHui; 807 case 32: 808 return isStore ? AArch64::STRWui : AArch64::LDRWui; 809 case 64: 810 return isStore ? AArch64::STRXui : AArch64::LDRXui; 811 } 812 break; 813 case AArch64::FPRRegBankID: 814 switch (OpSize) { 815 case 8: 816 return isStore ? AArch64::STRBui : AArch64::LDRBui; 817 case 16: 818 return isStore ? AArch64::STRHui : AArch64::LDRHui; 819 case 32: 820 return isStore ? AArch64::STRSui : AArch64::LDRSui; 821 case 64: 822 return isStore ? AArch64::STRDui : AArch64::LDRDui; 823 case 128: 824 return isStore ? AArch64::STRQui : AArch64::LDRQui; 825 } 826 break; 827 } 828 return GenericOpc; 829 } 830 831 #ifndef NDEBUG 832 /// Helper function that verifies that we have a valid copy at the end of 833 /// selectCopy. Verifies that the source and dest have the expected sizes and 834 /// then returns true. 
static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
                        const MachineRegisterInfo &MRI,
                        const TargetRegisterInfo &TRI,
                        const RegisterBankInfo &RBI) {
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();
  const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Make sure the size of the source and dest line up.
  assert(
      (DstSize == SrcSize ||
       // Copies are a means to set up initial types, the number of
       // bits may not exactly match.
       (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
       // Copies are a means to copy bits around, as long as we are
       // on the same register class, that's fine. Otherwise, that
       // means we need some SUBREG_TO_REG or AND & co.
       (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
      "Copy with different width?!");

  // Check the size of the destination.
  assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
         "GPRs cannot get more than 64-bit width values");

  return true;
}
#endif

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g., "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank.
Or make a new helper that carries along some constraint 911 // information. 912 if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) 913 SrcSize = DstSize = 32; 914 915 return {getMinClassForRegBank(SrcRegBank, SrcSize, true), 916 getMinClassForRegBank(DstRegBank, DstSize, true)}; 917 } 918 919 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, 920 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, 921 const RegisterBankInfo &RBI) { 922 Register DstReg = I.getOperand(0).getReg(); 923 Register SrcReg = I.getOperand(1).getReg(); 924 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); 925 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); 926 927 // Find the correct register classes for the source and destination registers. 928 const TargetRegisterClass *SrcRC; 929 const TargetRegisterClass *DstRC; 930 std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); 931 932 if (!DstRC) { 933 LLVM_DEBUG(dbgs() << "Unexpected dest size " 934 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); 935 return false; 936 } 937 938 // A couple helpers below, for making sure that the copy we produce is valid. 939 940 // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want 941 // to verify that the src and dst are the same size, since that's handled by 942 // the SUBREG_TO_REG. 943 bool KnownValid = false; 944 945 // Returns true, or asserts if something we don't expect happens. Instead of 946 // returning true, we return isValidCopy() to ensure that we verify the 947 // result. 948 auto CheckCopy = [&]() { 949 // If we have a bitcast or something, we can't have physical registers. 950 assert((I.isCopy() || 951 (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && 952 !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && 953 "No phys reg on generic operator!"); 954 bool ValidCopy = true; 955 #ifndef NDEBUG 956 ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); 957 assert(ValidCopy && "Invalid copy."); 958 #endif 959 (void)KnownValid; 960 return ValidCopy; 961 }; 962 963 // Is this a copy? If so, then we may need to insert a subregister copy. 964 if (I.isCopy()) { 965 // Yes. Check if there's anything to fix up. 966 if (!SrcRC) { 967 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); 968 return false; 969 } 970 971 unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); 972 unsigned DstSize = TRI.getRegSizeInBits(*DstRC); 973 unsigned SubReg; 974 975 // If the source bank doesn't support a subregister copy small enough, 976 // then we first need to copy to the destination bank. 977 if (getMinSizeForRegBank(SrcRegBank) > DstSize) { 978 const TargetRegisterClass *DstTempRC = 979 getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); 980 getSubRegForClass(DstRC, TRI, SubReg); 981 982 MachineIRBuilder MIB(I); 983 auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); 984 copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); 985 } else if (SrcSize > DstSize) { 986 // If the source register is bigger than the destination we need to 987 // perform a subregister copy. 988 const TargetRegisterClass *SubRegRC = 989 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); 990 getSubRegForClass(SubRegRC, TRI, SubReg); 991 copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); 992 } else if (DstSize > SrcSize) { 993 // If the destination register is bigger than the source we need to do 994 // a promotion using SUBREG_TO_REG. 
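      //
      // For example (illustrative operands), widening a 32-bit value into a
      // 64-bit register becomes
      //   %promoted:gpr64 = SUBREG_TO_REG 0, %src:gpr32, sub_32
      // which asserts the value of the bits outside the subregister rather
      // than emitting a real widening instruction.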
995 const TargetRegisterClass *PromotionRC = 996 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); 997 getSubRegForClass(SrcRC, TRI, SubReg); 998 999 Register PromoteReg = MRI.createVirtualRegister(PromotionRC); 1000 BuildMI(*I.getParent(), I, I.getDebugLoc(), 1001 TII.get(AArch64::SUBREG_TO_REG), PromoteReg) 1002 .addImm(0) 1003 .addUse(SrcReg) 1004 .addImm(SubReg); 1005 MachineOperand &RegOp = I.getOperand(1); 1006 RegOp.setReg(PromoteReg); 1007 1008 // Promise that the copy is implicitly validated by the SUBREG_TO_REG. 1009 KnownValid = true; 1010 } 1011 1012 // If the destination is a physical register, then there's nothing to 1013 // change, so we're done. 1014 if (Register::isPhysicalRegister(DstReg)) 1015 return CheckCopy(); 1016 } 1017 1018 // No need to constrain SrcReg. It will get constrained when we hit another 1019 // of its use or its defs. Copies do not have constraints. 1020 if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 1021 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) 1022 << " operand\n"); 1023 return false; 1024 } 1025 1026 // If this a GPR ZEXT that we want to just reduce down into a copy. 1027 // The sizes will be mismatched with the source < 32b but that's ok. 1028 if (I.getOpcode() == TargetOpcode::G_ZEXT) { 1029 I.setDesc(TII.get(AArch64::COPY)); 1030 assert(SrcRegBank.getID() == AArch64::GPRRegBankID); 1031 return selectCopy(I, TII, MRI, TRI, RBI); 1032 } 1033 1034 I.setDesc(TII.get(AArch64::COPY)); 1035 return CheckCopy(); 1036 } 1037 1038 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { 1039 if (!DstTy.isScalar() || !SrcTy.isScalar()) 1040 return GenericOpc; 1041 1042 const unsigned DstSize = DstTy.getSizeInBits(); 1043 const unsigned SrcSize = SrcTy.getSizeInBits(); 1044 1045 switch (DstSize) { 1046 case 32: 1047 switch (SrcSize) { 1048 case 32: 1049 switch (GenericOpc) { 1050 case TargetOpcode::G_SITOFP: 1051 return AArch64::SCVTFUWSri; 1052 case TargetOpcode::G_UITOFP: 1053 return AArch64::UCVTFUWSri; 1054 case TargetOpcode::G_FPTOSI: 1055 return AArch64::FCVTZSUWSr; 1056 case TargetOpcode::G_FPTOUI: 1057 return AArch64::FCVTZUUWSr; 1058 default: 1059 return GenericOpc; 1060 } 1061 case 64: 1062 switch (GenericOpc) { 1063 case TargetOpcode::G_SITOFP: 1064 return AArch64::SCVTFUXSri; 1065 case TargetOpcode::G_UITOFP: 1066 return AArch64::UCVTFUXSri; 1067 case TargetOpcode::G_FPTOSI: 1068 return AArch64::FCVTZSUWDr; 1069 case TargetOpcode::G_FPTOUI: 1070 return AArch64::FCVTZUUWDr; 1071 default: 1072 return GenericOpc; 1073 } 1074 default: 1075 return GenericOpc; 1076 } 1077 case 64: 1078 switch (SrcSize) { 1079 case 32: 1080 switch (GenericOpc) { 1081 case TargetOpcode::G_SITOFP: 1082 return AArch64::SCVTFUWDri; 1083 case TargetOpcode::G_UITOFP: 1084 return AArch64::UCVTFUWDri; 1085 case TargetOpcode::G_FPTOSI: 1086 return AArch64::FCVTZSUXSr; 1087 case TargetOpcode::G_FPTOUI: 1088 return AArch64::FCVTZUUXSr; 1089 default: 1090 return GenericOpc; 1091 } 1092 case 64: 1093 switch (GenericOpc) { 1094 case TargetOpcode::G_SITOFP: 1095 return AArch64::SCVTFUXDri; 1096 case TargetOpcode::G_UITOFP: 1097 return AArch64::UCVTFUXDri; 1098 case TargetOpcode::G_FPTOSI: 1099 return AArch64::FCVTZSUXDr; 1100 case TargetOpcode::G_FPTOUI: 1101 return AArch64::FCVTZUUXDr; 1102 default: 1103 return GenericOpc; 1104 } 1105 default: 1106 return GenericOpc; 1107 } 1108 default: 1109 return GenericOpc; 1110 }; 1111 return GenericOpc; 1112 } 1113 1114 MachineInstr * 1115 
AArch64InstructionSelector::emitSelect(Register Dst, Register True, 1116 Register False, AArch64CC::CondCode CC, 1117 MachineIRBuilder &MIB) const { 1118 MachineRegisterInfo &MRI = *MIB.getMRI(); 1119 assert(RBI.getRegBank(False, MRI, TRI)->getID() == 1120 RBI.getRegBank(True, MRI, TRI)->getID() && 1121 "Expected both select operands to have the same regbank?"); 1122 LLT Ty = MRI.getType(True); 1123 if (Ty.isVector()) 1124 return nullptr; 1125 const unsigned Size = Ty.getSizeInBits(); 1126 assert((Size == 32 || Size == 64) && 1127 "Expected 32 bit or 64 bit select only?"); 1128 const bool Is32Bit = Size == 32; 1129 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { 1130 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; 1131 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); 1132 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); 1133 return &*FCSel; 1134 } 1135 1136 // By default, we'll try and emit a CSEL. 1137 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr; 1138 bool Optimized = false; 1139 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, 1140 &Optimized](Register &Reg, Register &OtherReg, 1141 bool Invert) { 1142 if (Optimized) 1143 return false; 1144 1145 // Attempt to fold: 1146 // 1147 // %sub = G_SUB 0, %x 1148 // %select = G_SELECT cc, %reg, %sub 1149 // 1150 // Into: 1151 // %select = CSNEG %reg, %x, cc 1152 Register MatchReg; 1153 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { 1154 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; 1155 Reg = MatchReg; 1156 if (Invert) { 1157 CC = AArch64CC::getInvertedCondCode(CC); 1158 std::swap(Reg, OtherReg); 1159 } 1160 return true; 1161 } 1162 1163 // Attempt to fold: 1164 // 1165 // %xor = G_XOR %x, -1 1166 // %select = G_SELECT cc, %reg, %xor 1167 // 1168 // Into: 1169 // %select = CSINV %reg, %x, cc 1170 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { 1171 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1172 Reg = MatchReg; 1173 if (Invert) { 1174 CC = AArch64CC::getInvertedCondCode(CC); 1175 std::swap(Reg, OtherReg); 1176 } 1177 return true; 1178 } 1179 1180 // Attempt to fold: 1181 // 1182 // %add = G_ADD %x, 1 1183 // %select = G_SELECT cc, %reg, %add 1184 // 1185 // Into: 1186 // %select = CSINC %reg, %x, cc 1187 if (mi_match(Reg, MRI, 1188 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)), 1189 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) { 1190 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1191 Reg = MatchReg; 1192 if (Invert) { 1193 CC = AArch64CC::getInvertedCondCode(CC); 1194 std::swap(Reg, OtherReg); 1195 } 1196 return true; 1197 } 1198 1199 return false; 1200 }; 1201 1202 // Helper lambda which tries to use CSINC/CSINV for the instruction when its 1203 // true/false values are constants. 1204 // FIXME: All of these patterns already exist in tablegen. We should be 1205 // able to import these. 1206 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, 1207 &Optimized]() { 1208 if (Optimized) 1209 return false; 1210 auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI); 1211 auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI); 1212 if (!TrueCst && !FalseCst) 1213 return false; 1214 1215 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; 1216 if (TrueCst && FalseCst) { 1217 int64_t T = TrueCst->Value.getSExtValue(); 1218 int64_t F = FalseCst->Value.getSExtValue(); 1219 1220 if (T == 0 && F == 1) { 1221 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc 1222 Opc = Is32Bit ? 
            AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
1339 Reg = NextReg; 1340 continue; 1341 } 1342 1343 // Attempt to find a suitable operation with a constant on one side. 1344 Optional<uint64_t> C; 1345 Register TestReg; 1346 switch (Opc) { 1347 default: 1348 break; 1349 case TargetOpcode::G_AND: 1350 case TargetOpcode::G_XOR: { 1351 TestReg = MI->getOperand(1).getReg(); 1352 Register ConstantReg = MI->getOperand(2).getReg(); 1353 auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1354 if (!VRegAndVal) { 1355 // AND commutes, check the other side for a constant. 1356 // FIXME: Can we canonicalize the constant so that it's always on the 1357 // same side at some point earlier? 1358 std::swap(ConstantReg, TestReg); 1359 VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1360 } 1361 if (VRegAndVal) { 1362 if (HasZext) 1363 C = VRegAndVal->Value.getZExtValue(); 1364 else 1365 C = VRegAndVal->Value.getSExtValue(); 1366 } 1367 break; 1368 } 1369 case TargetOpcode::G_ASHR: 1370 case TargetOpcode::G_LSHR: 1371 case TargetOpcode::G_SHL: { 1372 TestReg = MI->getOperand(1).getReg(); 1373 auto VRegAndVal = 1374 getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); 1375 if (VRegAndVal) 1376 C = VRegAndVal->Value.getSExtValue(); 1377 break; 1378 } 1379 } 1380 1381 // Didn't find a constant or viable register. Bail out of the loop. 1382 if (!C || !TestReg.isValid()) 1383 break; 1384 1385 // We found a suitable instruction with a constant. Check to see if we can 1386 // walk through the instruction. 1387 Register NextReg; 1388 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); 1389 switch (Opc) { 1390 default: 1391 break; 1392 case TargetOpcode::G_AND: 1393 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. 1394 if ((*C >> Bit) & 1) 1395 NextReg = TestReg; 1396 break; 1397 case TargetOpcode::G_SHL: 1398 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in 1399 // the type of the register. 1400 if (*C <= Bit && (Bit - *C) < TestRegSize) { 1401 NextReg = TestReg; 1402 Bit = Bit - *C; 1403 } 1404 break; 1405 case TargetOpcode::G_ASHR: 1406 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits 1407 // in x 1408 NextReg = TestReg; 1409 Bit = Bit + *C; 1410 if (Bit >= TestRegSize) 1411 Bit = TestRegSize - 1; 1412 break; 1413 case TargetOpcode::G_LSHR: 1414 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x 1415 if ((Bit + *C) < TestRegSize) { 1416 NextReg = TestReg; 1417 Bit = Bit + *C; 1418 } 1419 break; 1420 case TargetOpcode::G_XOR: 1421 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when 1422 // appropriate. 1423 // 1424 // e.g. If x' = xor x, c, and the b-th bit is set in c then 1425 // 1426 // tbz x', b -> tbnz x, b 1427 // 1428 // Because x' only has the b-th bit set if x does not. 1429 if ((*C >> Bit) & 1) 1430 Invert = !Invert; 1431 NextReg = TestReg; 1432 break; 1433 } 1434 1435 // Check if we found anything worth folding. 1436 if (!NextReg.isValid()) 1437 return Reg; 1438 Reg = NextReg; 1439 } 1440 1441 return Reg; 1442 } 1443 1444 MachineInstr *AArch64InstructionSelector::emitTestBit( 1445 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, 1446 MachineIRBuilder &MIB) const { 1447 assert(TestReg.isValid()); 1448 assert(ProduceNonFlagSettingCondBr && 1449 "Cannot emit TB(N)Z with speculation tracking!"); 1450 MachineRegisterInfo &MRI = *MIB.getMRI(); 1451 1452 // Attempt to optimize the test bit by walking over instructions. 
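  //
  // For example (sketch only), when testing bit 3 of a shifted value:
  //   %shl = G_SHL %x, 2
  //   tb(n)z %shl, 3
  // we can instead test bit 1 of %x directly:
  //   tb(n)z %x, 1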
1453 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); 1454 LLT Ty = MRI.getType(TestReg); 1455 unsigned Size = Ty.getSizeInBits(); 1456 assert(!Ty.isVector() && "Expected a scalar!"); 1457 assert(Bit < 64 && "Bit is too large!"); 1458 1459 // When the test register is a 64-bit register, we have to narrow to make 1460 // TBNZW work. 1461 bool UseWReg = Bit < 32; 1462 unsigned NecessarySize = UseWReg ? 32 : 64; 1463 if (Size != NecessarySize) 1464 TestReg = moveScalarRegClass( 1465 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, 1466 MIB); 1467 1468 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, 1469 {AArch64::TBZW, AArch64::TBNZW}}; 1470 unsigned Opc = OpcTable[UseWReg][IsNegative]; 1471 auto TestBitMI = 1472 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); 1473 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); 1474 return &*TestBitMI; 1475 } 1476 1477 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( 1478 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, 1479 MachineIRBuilder &MIB) const { 1480 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); 1481 // Given something like this: 1482 // 1483 // %x = ...Something... 1484 // %one = G_CONSTANT i64 1 1485 // %zero = G_CONSTANT i64 0 1486 // %and = G_AND %x, %one 1487 // %cmp = G_ICMP intpred(ne), %and, %zero 1488 // %cmp_trunc = G_TRUNC %cmp 1489 // G_BRCOND %cmp_trunc, %bb.3 1490 // 1491 // We want to try and fold the AND into the G_BRCOND and produce either a 1492 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 1493 // 1494 // In this case, we'd get 1495 // 1496 // TBNZ %x %bb.3 1497 // 1498 1499 // Check if the AND has a constant on its RHS which we can use as a mask. 1500 // If it's a power of 2, then it's the same as checking a specific bit. 1501 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) 1502 auto MaybeBit = getIConstantVRegValWithLookThrough( 1503 AndInst.getOperand(2).getReg(), *MIB.getMRI()); 1504 if (!MaybeBit) 1505 return false; 1506 1507 int32_t Bit = MaybeBit->Value.exactLogBase2(); 1508 if (Bit < 0) 1509 return false; 1510 1511 Register TestReg = AndInst.getOperand(1).getReg(); 1512 1513 // Emit a TB(N)Z. 
1514 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1515 return true; 1516 } 1517 1518 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1519 bool IsNegative, 1520 MachineBasicBlock *DestMBB, 1521 MachineIRBuilder &MIB) const { 1522 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1523 MachineRegisterInfo &MRI = *MIB.getMRI(); 1524 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1525 AArch64::GPRRegBankID && 1526 "Expected GPRs only?"); 1527 auto Ty = MRI.getType(CompareReg); 1528 unsigned Width = Ty.getSizeInBits(); 1529 assert(!Ty.isVector() && "Expected scalar only?"); 1530 assert(Width <= 64 && "Expected width to be at most 64?"); 1531 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1532 {AArch64::CBNZW, AArch64::CBNZX}}; 1533 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1534 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1535 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1536 return &*BranchMI; 1537 } 1538 1539 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1540 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1541 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1542 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1543 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1544 // totally clean. Some of them require two branches to implement. 1545 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1546 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1547 Pred); 1548 AArch64CC::CondCode CC1, CC2; 1549 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1550 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1551 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1552 if (CC2 != AArch64CC::AL) 1553 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1554 I.eraseFromParent(); 1555 return true; 1556 } 1557 1558 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1559 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1560 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1561 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1562 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1563 // 1564 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1565 // instructions will not be produced, as they are conditional branch 1566 // instructions that do not set flags. 1567 if (!ProduceNonFlagSettingCondBr) 1568 return false; 1569 1570 MachineRegisterInfo &MRI = *MIB.getMRI(); 1571 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1572 auto Pred = 1573 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1574 Register LHS = ICmp.getOperand(2).getReg(); 1575 Register RHS = ICmp.getOperand(3).getReg(); 1576 1577 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1578 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1579 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1580 1581 // When we can emit a TB(N)Z, prefer that. 1582 // 1583 // Handle non-commutative condition codes first. 1584 // Note that we don't want to do this when we have a G_AND because it can 1585 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1586 if (VRegAndVal && !AndInst) { 1587 int64_t C = VRegAndVal->Value.getSExtValue(); 1588 1589 // When we have a greater-than comparison, we can just test if the msb is 1590 // zero. 
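    // (For signed x, x > -1 holds exactly when the sign bit of x is clear, so
    // the branch folds to a TBZ on the most significant bit.)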
1591 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1592 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1593 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1594 I.eraseFromParent(); 1595 return true; 1596 } 1597 1598 // When we have a less than comparison, we can just test if the msb is not 1599 // zero. 1600 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1601 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1602 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1603 I.eraseFromParent(); 1604 return true; 1605 } 1606 } 1607 1608 // Attempt to handle commutative condition codes. Right now, that's only 1609 // eq/ne. 1610 if (ICmpInst::isEquality(Pred)) { 1611 if (!VRegAndVal) { 1612 std::swap(RHS, LHS); 1613 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1614 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1615 } 1616 1617 if (VRegAndVal && VRegAndVal->Value == 0) { 1618 // If there's a G_AND feeding into this branch, try to fold it away by 1619 // emitting a TB(N)Z instead. 1620 // 1621 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1622 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1623 // would be redundant. 1624 if (AndInst && 1625 tryOptAndIntoCompareBranch( 1626 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1627 I.eraseFromParent(); 1628 return true; 1629 } 1630 1631 // Otherwise, try to emit a CB(N)Z instead. 1632 auto LHSTy = MRI.getType(LHS); 1633 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1634 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1635 I.eraseFromParent(); 1636 return true; 1637 } 1638 } 1639 } 1640 1641 return false; 1642 } 1643 1644 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1645 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1646 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1647 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1648 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1649 return true; 1650 1651 // Couldn't optimize. Emit a compare + a Bcc. 1652 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1653 auto PredOp = ICmp.getOperand(1); 1654 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1655 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1656 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1657 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1658 I.eraseFromParent(); 1659 return true; 1660 } 1661 1662 bool AArch64InstructionSelector::selectCompareBranch( 1663 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1664 Register CondReg = I.getOperand(0).getReg(); 1665 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1666 if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) { 1667 CondReg = CCMI->getOperand(1).getReg(); 1668 CCMI = MRI.getVRegDef(CondReg); 1669 } 1670 1671 // Try to select the G_BRCOND using whatever is feeding the condition if 1672 // possible. 1673 unsigned CCMIOpc = CCMI->getOpcode(); 1674 if (CCMIOpc == TargetOpcode::G_FCMP) 1675 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1676 if (CCMIOpc == TargetOpcode::G_ICMP) 1677 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1678 1679 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1680 // instructions will not be produced, as they are conditional branch 1681 // instructions that do not set flags. 
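  // Otherwise the condition is just a boolean-like value. When non-flag-setting
  // branches are allowed, branch directly on bit 0 of the condition register;
  // only the low bit of the (possibly truncated) condition is meaningful.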
1682 if (ProduceNonFlagSettingCondBr) { 1683 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1684 I.getOperand(1).getMBB(), MIB); 1685 I.eraseFromParent(); 1686 return true; 1687 } 1688 1689 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1690 auto TstMI = 1691 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1692 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1693 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1694 .addImm(AArch64CC::EQ) 1695 .addMBB(I.getOperand(1).getMBB()); 1696 I.eraseFromParent(); 1697 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1698 } 1699 1700 /// Returns the element immediate value of a vector shift operand if found. 1701 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1702 static Optional<int64_t> getVectorShiftImm(Register Reg, 1703 MachineRegisterInfo &MRI) { 1704 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1705 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1706 assert(OpMI && "Expected to find a vreg def for vector shift operand"); 1707 return getAArch64VectorSplatScalar(*OpMI, MRI); 1708 } 1709 1710 /// Matches and returns the shift immediate value for a SHL instruction given 1711 /// a shift operand. 1712 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { 1713 Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1714 if (!ShiftImm) 1715 return None; 1716 // Check the immediate is in range for a SHL. 1717 int64_t Imm = *ShiftImm; 1718 if (Imm < 0) 1719 return None; 1720 switch (SrcTy.getElementType().getSizeInBits()) { 1721 default: 1722 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1723 return None; 1724 case 8: 1725 if (Imm > 7) 1726 return None; 1727 break; 1728 case 16: 1729 if (Imm > 15) 1730 return None; 1731 break; 1732 case 32: 1733 if (Imm > 31) 1734 return None; 1735 break; 1736 case 64: 1737 if (Imm > 63) 1738 return None; 1739 break; 1740 } 1741 return Imm; 1742 } 1743 1744 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1745 MachineRegisterInfo &MRI) { 1746 assert(I.getOpcode() == TargetOpcode::G_SHL); 1747 Register DstReg = I.getOperand(0).getReg(); 1748 const LLT Ty = MRI.getType(DstReg); 1749 Register Src1Reg = I.getOperand(1).getReg(); 1750 Register Src2Reg = I.getOperand(2).getReg(); 1751 1752 if (!Ty.isVector()) 1753 return false; 1754 1755 // Check if we have a vector of constants on RHS that we can select as the 1756 // immediate form. 1757 Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1758 1759 unsigned Opc = 0; 1760 if (Ty == LLT::fixed_vector(2, 64)) { 1761 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1762 } else if (Ty == LLT::fixed_vector(4, 32)) { 1763 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1764 } else if (Ty == LLT::fixed_vector(2, 32)) { 1765 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1766 } else if (Ty == LLT::fixed_vector(4, 16)) { 1767 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1768 } else if (Ty == LLT::fixed_vector(8, 16)) { 1769 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1770 } else if (Ty == LLT::fixed_vector(16, 8)) { 1771 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1772 } else if (Ty == LLT::fixed_vector(8, 8)) { 1773 Opc = ImmVal ? 
AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1774 } else { 1775 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1776 return false; 1777 } 1778 1779 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1780 if (ImmVal) 1781 Shl.addImm(*ImmVal); 1782 else 1783 Shl.addUse(Src2Reg); 1784 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1785 I.eraseFromParent(); 1786 return true; 1787 } 1788 1789 bool AArch64InstructionSelector::selectVectorAshrLshr( 1790 MachineInstr &I, MachineRegisterInfo &MRI) { 1791 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1792 I.getOpcode() == TargetOpcode::G_LSHR); 1793 Register DstReg = I.getOperand(0).getReg(); 1794 const LLT Ty = MRI.getType(DstReg); 1795 Register Src1Reg = I.getOperand(1).getReg(); 1796 Register Src2Reg = I.getOperand(2).getReg(); 1797 1798 if (!Ty.isVector()) 1799 return false; 1800 1801 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1802 1803 // We expect the immediate case to be lowered in the PostLegalCombiner to 1804 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1805 1806 // There is not a shift right register instruction, but the shift left 1807 // register instruction takes a signed value, where negative numbers specify a 1808 // right shift. 1809 1810 unsigned Opc = 0; 1811 unsigned NegOpc = 0; 1812 const TargetRegisterClass *RC = 1813 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI); 1814 if (Ty == LLT::fixed_vector(2, 64)) { 1815 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1816 NegOpc = AArch64::NEGv2i64; 1817 } else if (Ty == LLT::fixed_vector(4, 32)) { 1818 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1819 NegOpc = AArch64::NEGv4i32; 1820 } else if (Ty == LLT::fixed_vector(2, 32)) { 1821 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1822 NegOpc = AArch64::NEGv2i32; 1823 } else if (Ty == LLT::fixed_vector(4, 16)) { 1824 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1825 NegOpc = AArch64::NEGv4i16; 1826 } else if (Ty == LLT::fixed_vector(8, 16)) { 1827 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1828 NegOpc = AArch64::NEGv8i16; 1829 } else if (Ty == LLT::fixed_vector(16, 8)) { 1830 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1831 NegOpc = AArch64::NEGv16i8; 1832 } else if (Ty == LLT::fixed_vector(8, 8)) { 1833 Opc = IsASHR ? 
AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1834 NegOpc = AArch64::NEGv8i8; 1835 } else { 1836 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1837 return false; 1838 } 1839 1840 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1841 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1842 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1843 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1844 I.eraseFromParent(); 1845 return true; 1846 } 1847 1848 bool AArch64InstructionSelector::selectVaStartAAPCS( 1849 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1850 return false; 1851 } 1852 1853 bool AArch64InstructionSelector::selectVaStartDarwin( 1854 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1855 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 1856 Register ListReg = I.getOperand(0).getReg(); 1857 1858 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1859 1860 auto MIB = 1861 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 1862 .addDef(ArgsAddrReg) 1863 .addFrameIndex(FuncInfo->getVarArgsStackIndex()) 1864 .addImm(0) 1865 .addImm(0); 1866 1867 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1868 1869 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 1870 .addUse(ArgsAddrReg) 1871 .addUse(ListReg) 1872 .addImm(0) 1873 .addMemOperand(*I.memoperands_begin()); 1874 1875 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1876 I.eraseFromParent(); 1877 return true; 1878 } 1879 1880 void AArch64InstructionSelector::materializeLargeCMVal( 1881 MachineInstr &I, const Value *V, unsigned OpFlags) { 1882 MachineBasicBlock &MBB = *I.getParent(); 1883 MachineFunction &MF = *MBB.getParent(); 1884 MachineRegisterInfo &MRI = MF.getRegInfo(); 1885 1886 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 1887 MovZ->addOperand(MF, I.getOperand(1)); 1888 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 1889 AArch64II::MO_NC); 1890 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 1891 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 1892 1893 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 1894 Register ForceDstReg) { 1895 Register DstReg = ForceDstReg 1896 ? 
ForceDstReg 1897 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1898 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 1899 if (auto *GV = dyn_cast<GlobalValue>(V)) { 1900 MovI->addOperand(MF, MachineOperand::CreateGA( 1901 GV, MovZ->getOperand(1).getOffset(), Flags)); 1902 } else { 1903 MovI->addOperand( 1904 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 1905 MovZ->getOperand(1).getOffset(), Flags)); 1906 } 1907 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 1908 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 1909 return DstReg; 1910 }; 1911 Register DstReg = BuildMovK(MovZ.getReg(0), 1912 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 1913 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 1914 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 1915 } 1916 1917 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 1918 MachineBasicBlock &MBB = *I.getParent(); 1919 MachineFunction &MF = *MBB.getParent(); 1920 MachineRegisterInfo &MRI = MF.getRegInfo(); 1921 1922 switch (I.getOpcode()) { 1923 case TargetOpcode::G_SHL: 1924 case TargetOpcode::G_ASHR: 1925 case TargetOpcode::G_LSHR: { 1926 // These shifts are legalized to have 64 bit shift amounts because we want 1927 // to take advantage of the existing imported selection patterns that assume 1928 // the immediates are s64s. However, if the shifted type is 32 bits and for 1929 // some reason we receive input GMIR that has an s64 shift amount that's not 1930 // a G_CONSTANT, insert a truncate so that we can still select the s32 1931 // register-register variant. 1932 Register SrcReg = I.getOperand(1).getReg(); 1933 Register ShiftReg = I.getOperand(2).getReg(); 1934 const LLT ShiftTy = MRI.getType(ShiftReg); 1935 const LLT SrcTy = MRI.getType(SrcReg); 1936 if (SrcTy.isVector()) 1937 return false; 1938 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 1939 if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) 1940 return false; 1941 auto *AmtMI = MRI.getVRegDef(ShiftReg); 1942 assert(AmtMI && "could not find a vreg definition for shift amount"); 1943 if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { 1944 // Insert a subregister copy to implement a 64->32 trunc 1945 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 1946 .addReg(ShiftReg, 0, AArch64::sub_32); 1947 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 1948 I.getOperand(2).setReg(Trunc.getReg(0)); 1949 } 1950 return true; 1951 } 1952 case TargetOpcode::G_STORE: { 1953 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 1954 MachineOperand &SrcOp = I.getOperand(0); 1955 if (MRI.getType(SrcOp.getReg()).isPointer()) { 1956 // Allow matching with imported patterns for stores of pointers. Unlike 1957 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 1958 // and constrain. 1959 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 1960 Register NewSrc = Copy.getReg(0); 1961 SrcOp.setReg(NewSrc); 1962 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 1963 Changed = true; 1964 } 1965 return Changed; 1966 } 1967 case TargetOpcode::G_PTR_ADD: 1968 return convertPtrAddToAdd(I, MRI); 1969 case TargetOpcode::G_LOAD: { 1970 // For scalar loads of pointers, we try to convert the dest type from p0 1971 // to s64 so that our imported patterns can match. 
Like with the G_PTR_ADD 1972 // conversion, this should be ok because all users should have been 1973 // selected already, so the type doesn't matter for them. 1974 Register DstReg = I.getOperand(0).getReg(); 1975 const LLT DstTy = MRI.getType(DstReg); 1976 if (!DstTy.isPointer()) 1977 return false; 1978 MRI.setType(DstReg, LLT::scalar(64)); 1979 return true; 1980 } 1981 case AArch64::G_DUP: { 1982 // Convert the type from p0 to s64 to help selection. 1983 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 1984 if (!DstTy.getElementType().isPointer()) 1985 return false; 1986 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 1987 MRI.setType(I.getOperand(0).getReg(), 1988 DstTy.changeElementType(LLT::scalar(64))); 1989 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 1990 I.getOperand(1).setReg(NewSrc.getReg(0)); 1991 return true; 1992 } 1993 case TargetOpcode::G_UITOFP: 1994 case TargetOpcode::G_SITOFP: { 1995 // If both source and destination regbanks are FPR, then convert the opcode 1996 // to G_SITOF so that the importer can select it to an fpr variant. 1997 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 1998 // copy. 1999 Register SrcReg = I.getOperand(1).getReg(); 2000 LLT SrcTy = MRI.getType(SrcReg); 2001 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2002 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2003 return false; 2004 2005 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2006 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2007 I.setDesc(TII.get(AArch64::G_SITOF)); 2008 else 2009 I.setDesc(TII.get(AArch64::G_UITOF)); 2010 return true; 2011 } 2012 return false; 2013 } 2014 default: 2015 return false; 2016 } 2017 } 2018 2019 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2020 /// them to a standard G_ADD with a COPY on the source. 2021 /// 2022 /// The motivation behind this is to expose the add semantics to the imported 2023 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2024 /// because the selector works bottom up, uses before defs. By the time we 2025 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2026 /// fold this into addressing modes and were therefore unsuccessful. 2027 bool AArch64InstructionSelector::convertPtrAddToAdd( 2028 MachineInstr &I, MachineRegisterInfo &MRI) { 2029 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2030 Register DstReg = I.getOperand(0).getReg(); 2031 Register AddOp1Reg = I.getOperand(1).getReg(); 2032 const LLT PtrTy = MRI.getType(DstReg); 2033 if (PtrTy.getAddressSpace() != 0) 2034 return false; 2035 2036 const LLT CastPtrTy = 2037 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2038 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2039 // Set regbanks on the registers. 
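  // Vectors of pointers are selected like v2s64 values and live on the FPR
  // bank; scalar pointers live on the GPR bank.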
2040 if (PtrTy.isVector()) 2041 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2042 else 2043 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2044 2045 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2046 // %dst(intty) = G_ADD %intbase, off 2047 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2048 MRI.setType(DstReg, CastPtrTy); 2049 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2050 if (!select(*PtrToInt)) { 2051 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2052 return false; 2053 } 2054 2055 // Also take the opportunity here to try to do some optimization. 2056 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 2057 Register NegatedReg; 2058 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2059 return true; 2060 I.getOperand(2).setReg(NegatedReg); 2061 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2062 return true; 2063 } 2064 2065 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2066 MachineRegisterInfo &MRI) { 2067 // We try to match the immediate variant of LSL, which is actually an alias 2068 // for a special case of UBFM. Otherwise, we fall back to the imported 2069 // selector which will match the register variant. 2070 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2071 const auto &MO = I.getOperand(2); 2072 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2073 if (!VRegAndVal) 2074 return false; 2075 2076 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2077 if (DstTy.isVector()) 2078 return false; 2079 bool Is64Bit = DstTy.getSizeInBits() == 64; 2080 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2081 auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); 2082 2083 if (!Imm1Fn || !Imm2Fn) 2084 return false; 2085 2086 auto NewI = 2087 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2088 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2089 2090 for (auto &RenderFn : *Imm1Fn) 2091 RenderFn(NewI); 2092 for (auto &RenderFn : *Imm2Fn) 2093 RenderFn(NewI); 2094 2095 I.eraseFromParent(); 2096 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2097 } 2098 2099 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2100 MachineInstr &I, MachineRegisterInfo &MRI) { 2101 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2102 // If we're storing a scalar, it doesn't matter what register bank that 2103 // scalar is on. All that matters is the size. 2104 // 2105 // So, if we see something like this (with a 32-bit scalar as an example): 2106 // 2107 // %x:gpr(s32) = ... something ... 2108 // %y:fpr(s32) = COPY %x:gpr(s32) 2109 // G_STORE %y:fpr(s32) 2110 // 2111 // We can fix this up into something like this: 2112 // 2113 // G_STORE %x:gpr(s32) 2114 // 2115 // And then continue the selection process normally. 2116 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2117 if (!DefDstReg.isValid()) 2118 return false; 2119 LLT DefDstTy = MRI.getType(DefDstReg); 2120 Register StoreSrcReg = I.getOperand(0).getReg(); 2121 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2122 2123 // If we get something strange like a physical register, then we shouldn't 2124 // go any further. 2125 if (!DefDstTy.isValid()) 2126 return false; 2127 2128 // Are the source and dst types the same size? 
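  // The fold below just swaps in the copy's source register, so it is only
  // valid if that register has exactly the same number of bits as the value
  // currently being stored.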
2129 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2130 return false; 2131 2132 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2133 RBI.getRegBank(DefDstReg, MRI, TRI)) 2134 return false; 2135 2136 // We have a cross-bank copy, which is entering a store. Let's fold it. 2137 I.getOperand(0).setReg(DefDstReg); 2138 return true; 2139 } 2140 2141 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2142 assert(I.getParent() && "Instruction should be in a basic block!"); 2143 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2144 2145 MachineBasicBlock &MBB = *I.getParent(); 2146 MachineFunction &MF = *MBB.getParent(); 2147 MachineRegisterInfo &MRI = MF.getRegInfo(); 2148 2149 switch (I.getOpcode()) { 2150 case AArch64::G_DUP: { 2151 // Before selecting a DUP instruction, check if it is better selected as a 2152 // MOV or load from a constant pool. 2153 Register Src = I.getOperand(1).getReg(); 2154 auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI); 2155 if (!ValAndVReg) 2156 return false; 2157 LLVMContext &Ctx = MF.getFunction().getContext(); 2158 Register Dst = I.getOperand(0).getReg(); 2159 auto *CV = ConstantDataVector::getSplat( 2160 MRI.getType(Dst).getNumElements(), 2161 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), 2162 ValAndVReg->Value)); 2163 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2164 return false; 2165 I.eraseFromParent(); 2166 return true; 2167 } 2168 case TargetOpcode::G_SEXT: 2169 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2170 // over a normal extend. 2171 if (selectUSMovFromExtend(I, MRI)) 2172 return true; 2173 return false; 2174 case TargetOpcode::G_BR: 2175 return false; 2176 case TargetOpcode::G_SHL: 2177 return earlySelectSHL(I, MRI); 2178 case TargetOpcode::G_CONSTANT: { 2179 bool IsZero = false; 2180 if (I.getOperand(1).isCImm()) 2181 IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; 2182 else if (I.getOperand(1).isImm()) 2183 IsZero = I.getOperand(1).getImm() == 0; 2184 2185 if (!IsZero) 2186 return false; 2187 2188 Register DefReg = I.getOperand(0).getReg(); 2189 LLT Ty = MRI.getType(DefReg); 2190 if (Ty.getSizeInBits() == 64) { 2191 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2192 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2193 } else if (Ty.getSizeInBits() == 32) { 2194 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2195 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2196 } else 2197 return false; 2198 2199 I.setDesc(TII.get(TargetOpcode::COPY)); 2200 return true; 2201 } 2202 2203 case TargetOpcode::G_ADD: { 2204 // Check if this is being fed by a G_ICMP on either side. 2205 // 2206 // (cmp pred, x, y) + z 2207 // 2208 // In the above case, when the cmp is true, we increment z by 1. So, we can 2209 // fold the add into the cset for the cmp by using cinc. 2210 // 2211 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2212 Register AddDst = I.getOperand(0).getReg(); 2213 Register AddLHS = I.getOperand(1).getReg(); 2214 Register AddRHS = I.getOperand(2).getReg(); 2215 // Only handle scalars. 2216 LLT Ty = MRI.getType(AddLHS); 2217 if (Ty.isVector()) 2218 return false; 2219 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2220 // bits. 
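    // For example (illustrative assembly, register assignments are
    // hypothetical), 'z + (x == y)' can be selected as:
    //   cmp  w8, w9
    //   cinc w0, w10, eq
    // rather than a cset followed by a separate add.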
2221 unsigned Size = Ty.getSizeInBits(); 2222 if (Size != 32 && Size != 64) 2223 return false; 2224 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2225 if (!MRI.hasOneNonDBGUse(Reg)) 2226 return nullptr; 2227 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2228 // compare. 2229 if (Size == 32) 2230 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2231 // We model scalar compares using 32-bit destinations right now. 2232 // If it's a 64-bit compare, it'll have 64-bit sources. 2233 Register ZExt; 2234 if (!mi_match(Reg, MRI, 2235 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2236 return nullptr; 2237 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2238 if (!Cmp || 2239 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2240 return nullptr; 2241 return Cmp; 2242 }; 2243 // Try to match 2244 // z + (cmp pred, x, y) 2245 MachineInstr *Cmp = MatchCmp(AddRHS); 2246 if (!Cmp) { 2247 // (cmp pred, x, y) + z 2248 std::swap(AddLHS, AddRHS); 2249 Cmp = MatchCmp(AddRHS); 2250 if (!Cmp) 2251 return false; 2252 } 2253 auto &PredOp = Cmp->getOperand(1); 2254 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2255 const AArch64CC::CondCode InvCC = 2256 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2257 MIB.setInstrAndDebugLoc(I); 2258 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2259 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2260 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2261 I.eraseFromParent(); 2262 return true; 2263 } 2264 case TargetOpcode::G_OR: { 2265 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2266 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2267 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2268 Register Dst = I.getOperand(0).getReg(); 2269 LLT Ty = MRI.getType(Dst); 2270 2271 if (!Ty.isScalar()) 2272 return false; 2273 2274 unsigned Size = Ty.getSizeInBits(); 2275 if (Size != 32 && Size != 64) 2276 return false; 2277 2278 Register ShiftSrc; 2279 int64_t ShiftImm; 2280 Register MaskSrc; 2281 int64_t MaskImm; 2282 if (!mi_match( 2283 Dst, MRI, 2284 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2285 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2286 return false; 2287 2288 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2289 return false; 2290 2291 int64_t Immr = Size - ShiftImm; 2292 int64_t Imms = Size - ShiftImm - 1; 2293 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2294 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2295 I.eraseFromParent(); 2296 return true; 2297 } 2298 default: 2299 return false; 2300 } 2301 } 2302 2303 bool AArch64InstructionSelector::select(MachineInstr &I) { 2304 assert(I.getParent() && "Instruction should be in a basic block!"); 2305 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2306 2307 MachineBasicBlock &MBB = *I.getParent(); 2308 MachineFunction &MF = *MBB.getParent(); 2309 MachineRegisterInfo &MRI = MF.getRegInfo(); 2310 2311 const AArch64Subtarget *Subtarget = 2312 &static_cast<const AArch64Subtarget &>(MF.getSubtarget()); 2313 if (Subtarget->requiresStrictAlign()) { 2314 // We don't support this feature yet. 
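    // Returning false reports a selection failure, so the function falls back
    // to SelectionDAG (or hits a diagnostic, depending on -global-isel-abort).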
2315 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2316 return false; 2317 } 2318 2319 MIB.setInstrAndDebugLoc(I); 2320 2321 unsigned Opcode = I.getOpcode(); 2322 // G_PHI requires same handling as PHI 2323 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2324 // Certain non-generic instructions also need some special handling. 2325 2326 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2327 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2328 2329 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2330 const Register DefReg = I.getOperand(0).getReg(); 2331 const LLT DefTy = MRI.getType(DefReg); 2332 2333 const RegClassOrRegBank &RegClassOrBank = 2334 MRI.getRegClassOrRegBank(DefReg); 2335 2336 const TargetRegisterClass *DefRC 2337 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 2338 if (!DefRC) { 2339 if (!DefTy.isValid()) { 2340 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2341 return false; 2342 } 2343 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 2344 DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); 2345 if (!DefRC) { 2346 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2347 return false; 2348 } 2349 } 2350 2351 I.setDesc(TII.get(TargetOpcode::PHI)); 2352 2353 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2354 } 2355 2356 if (I.isCopy()) 2357 return selectCopy(I, TII, MRI, TRI, RBI); 2358 2359 return true; 2360 } 2361 2362 2363 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2364 LLVM_DEBUG( 2365 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2366 return false; 2367 } 2368 2369 // Try to do some lowering before we start instruction selecting. These 2370 // lowerings are purely transformations on the input G_MIR and so selection 2371 // must continue after any modification of the instruction. 2372 if (preISelLower(I)) { 2373 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2374 } 2375 2376 // There may be patterns where the importer can't deal with them optimally, 2377 // but does select it to a suboptimal sequence so our custom C++ selection 2378 // code later never has a chance to work on it. Therefore, we have an early 2379 // selection attempt here to give priority to certain selection routines 2380 // over the imported ones. 2381 if (earlySelect(I)) 2382 return true; 2383 2384 if (selectImpl(I, *CoverageInfo)) 2385 return true; 2386 2387 LLT Ty = 2388 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; 2389 2390 switch (Opcode) { 2391 case TargetOpcode::G_SBFX: 2392 case TargetOpcode::G_UBFX: { 2393 static const unsigned OpcTable[2][2] = { 2394 {AArch64::UBFMWri, AArch64::UBFMXri}, 2395 {AArch64::SBFMWri, AArch64::SBFMXri}}; 2396 bool IsSigned = Opcode == TargetOpcode::G_SBFX; 2397 unsigned Size = Ty.getSizeInBits(); 2398 unsigned Opc = OpcTable[IsSigned][Size == 64]; 2399 auto Cst1 = 2400 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); 2401 assert(Cst1 && "Should have gotten a constant for src 1?"); 2402 auto Cst2 = 2403 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); 2404 assert(Cst2 && "Should have gotten a constant for src 2?"); 2405 auto LSB = Cst1->Value.getZExtValue(); 2406 auto Width = Cst2->Value.getZExtValue(); 2407 auto BitfieldInst = 2408 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) 2409 .addImm(LSB) 2410 .addImm(LSB + Width - 1); 2411 I.eraseFromParent(); 2412 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); 2413 } 2414 case TargetOpcode::G_BRCOND: 2415 return selectCompareBranch(I, MF, MRI); 2416 2417 case TargetOpcode::G_BRINDIRECT: { 2418 I.setDesc(TII.get(AArch64::BR)); 2419 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2420 } 2421 2422 case TargetOpcode::G_BRJT: 2423 return selectBrJT(I, MRI); 2424 2425 case AArch64::G_ADD_LOW: { 2426 // This op may have been separated from it's ADRP companion by the localizer 2427 // or some other code motion pass. Given that many CPUs will try to 2428 // macro fuse these operations anyway, select this into a MOVaddr pseudo 2429 // which will later be expanded into an ADRP+ADD pair after scheduling. 2430 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2431 if (BaseMI->getOpcode() != AArch64::ADRP) { 2432 I.setDesc(TII.get(AArch64::ADDXri)); 2433 I.addOperand(MachineOperand::CreateImm(0)); 2434 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2435 } 2436 assert(TM.getCodeModel() == CodeModel::Small && 2437 "Expected small code model"); 2438 auto Op1 = BaseMI->getOperand(1); 2439 auto Op2 = I.getOperand(2); 2440 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2441 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2442 Op1.getTargetFlags()) 2443 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2444 Op2.getTargetFlags()); 2445 I.eraseFromParent(); 2446 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2447 } 2448 2449 case TargetOpcode::G_BSWAP: { 2450 // Handle vector types for G_BSWAP directly. 2451 Register DstReg = I.getOperand(0).getReg(); 2452 LLT DstTy = MRI.getType(DstReg); 2453 2454 // We should only get vector types here; everything else is handled by the 2455 // importer right now. 2456 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { 2457 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); 2458 return false; 2459 } 2460 2461 // Only handle 4 and 2 element vectors for now. 2462 // TODO: 16-bit elements. 2463 unsigned NumElts = DstTy.getNumElements(); 2464 if (NumElts != 4 && NumElts != 2) { 2465 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); 2466 return false; 2467 } 2468 2469 // Choose the correct opcode for the supported types. Right now, that's 2470 // v2s32, v4s32, and v2s64. 2471 unsigned Opc = 0; 2472 unsigned EltSize = DstTy.getElementType().getSizeInBits(); 2473 if (EltSize == 32) 2474 Opc = (DstTy.getNumElements() == 2) ? 
AArch64::REV32v8i8 2475 : AArch64::REV32v16i8; 2476 else if (EltSize == 64) 2477 Opc = AArch64::REV64v16i8; 2478 2479 // We should always get something by the time we get here... 2480 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); 2481 2482 I.setDesc(TII.get(Opc)); 2483 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2484 } 2485 2486 case TargetOpcode::G_FCONSTANT: 2487 case TargetOpcode::G_CONSTANT: { 2488 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2489 2490 const LLT s8 = LLT::scalar(8); 2491 const LLT s16 = LLT::scalar(16); 2492 const LLT s32 = LLT::scalar(32); 2493 const LLT s64 = LLT::scalar(64); 2494 const LLT s128 = LLT::scalar(128); 2495 const LLT p0 = LLT::pointer(0, 64); 2496 2497 const Register DefReg = I.getOperand(0).getReg(); 2498 const LLT DefTy = MRI.getType(DefReg); 2499 const unsigned DefSize = DefTy.getSizeInBits(); 2500 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2501 2502 // FIXME: Redundant check, but even less readable when factored out. 2503 if (isFP) { 2504 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2505 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2506 << " constant, expected: " << s16 << " or " << s32 2507 << " or " << s64 << " or " << s128 << '\n'); 2508 return false; 2509 } 2510 2511 if (RB.getID() != AArch64::FPRRegBankID) { 2512 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2513 << " constant on bank: " << RB 2514 << ", expected: FPR\n"); 2515 return false; 2516 } 2517 2518 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2519 // can be sure tablegen works correctly and isn't rescued by this code. 2520 // 0.0 is not covered by tablegen for FP128. So we will handle this 2521 // scenario in the code here. 2522 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2523 return false; 2524 } else { 2525 // s32 and s64 are covered by tablegen. 2526 if (Ty != p0 && Ty != s8 && Ty != s16) { 2527 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2528 << " constant, expected: " << s32 << ", " << s64 2529 << ", or " << p0 << '\n'); 2530 return false; 2531 } 2532 2533 if (RB.getID() != AArch64::GPRRegBankID) { 2534 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2535 << " constant on bank: " << RB 2536 << ", expected: GPR\n"); 2537 return false; 2538 } 2539 } 2540 2541 if (isFP) { 2542 const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize); 2543 // For 16, 64, and 128b values, emit a constant pool load. 2544 switch (DefSize) { 2545 default: 2546 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2547 case 32: 2548 // For s32, use a cp load if we have optsize/minsize. 2549 if (!shouldOptForSize(&MF)) 2550 break; 2551 LLVM_FALLTHROUGH; 2552 case 16: 2553 case 64: 2554 case 128: { 2555 auto *FPImm = I.getOperand(1).getFPImm(); 2556 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2557 if (!LoadMI) { 2558 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2559 return false; 2560 } 2561 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2562 I.eraseFromParent(); 2563 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2564 } 2565 } 2566 2567 // Either emit a FMOV, or emit a copy to emit a normal mov. 
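      // Only the s32 case (when not optimizing for size) reaches this point:
      // materialize the bit pattern into a GPR with MOVi32imm, then copy it to
      // the FPR destination; the copy is typically lowered to an fmov.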
2568 assert(DefSize == 32 && 2569 "Expected constant pool loads for all sizes other than 32!"); 2570 const Register DefGPRReg = 2571 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2572 MachineOperand &RegOp = I.getOperand(0); 2573 RegOp.setReg(DefGPRReg); 2574 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2575 MIB.buildCopy({DefReg}, {DefGPRReg}); 2576 2577 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2578 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2579 return false; 2580 } 2581 2582 MachineOperand &ImmOp = I.getOperand(1); 2583 // FIXME: Is going through int64_t always correct? 2584 ImmOp.ChangeToImmediate( 2585 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2586 } else if (I.getOperand(1).isCImm()) { 2587 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2588 I.getOperand(1).ChangeToImmediate(Val); 2589 } else if (I.getOperand(1).isImm()) { 2590 uint64_t Val = I.getOperand(1).getImm(); 2591 I.getOperand(1).ChangeToImmediate(Val); 2592 } 2593 2594 const unsigned MovOpc = 2595 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2596 I.setDesc(TII.get(MovOpc)); 2597 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2598 return true; 2599 } 2600 case TargetOpcode::G_EXTRACT: { 2601 Register DstReg = I.getOperand(0).getReg(); 2602 Register SrcReg = I.getOperand(1).getReg(); 2603 LLT SrcTy = MRI.getType(SrcReg); 2604 LLT DstTy = MRI.getType(DstReg); 2605 (void)DstTy; 2606 unsigned SrcSize = SrcTy.getSizeInBits(); 2607 2608 if (SrcTy.getSizeInBits() > 64) { 2609 // This should be an extract of an s128, which is like a vector extract. 2610 if (SrcTy.getSizeInBits() != 128) 2611 return false; 2612 // Only support extracting 64 bits from an s128 at the moment. 2613 if (DstTy.getSizeInBits() != 64) 2614 return false; 2615 2616 unsigned Offset = I.getOperand(2).getImm(); 2617 if (Offset % 64 != 0) 2618 return false; 2619 2620 // Check we have the right regbank always. 2621 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2622 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2623 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2624 2625 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2626 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2627 .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2628 I.eraseFromParent(); 2629 return true; 2630 } 2631 2632 // Emit the same code as a vector extract. 2633 // Offset must be a multiple of 64. 2634 unsigned LaneIdx = Offset / 64; 2635 MachineInstr *Extract = emitExtractVectorElt( 2636 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2637 if (!Extract) 2638 return false; 2639 I.eraseFromParent(); 2640 return true; 2641 } 2642 2643 I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); 2644 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2645 Ty.getSizeInBits() - 1); 2646 2647 if (SrcSize < 64) { 2648 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2649 "unexpected G_EXTRACT types"); 2650 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2651 } 2652 2653 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2654 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2655 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2656 .addReg(DstReg, 0, AArch64::sub_32); 2657 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2658 AArch64::GPR32RegClass, MRI); 2659 I.getOperand(0).setReg(DstReg); 2660 2661 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2662 } 2663 2664 case TargetOpcode::G_INSERT: { 2665 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2666 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2667 unsigned DstSize = DstTy.getSizeInBits(); 2668 // Larger inserts are vectors, same-size ones should be something else by 2669 // now (split up or turned into COPYs). 2670 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2671 return false; 2672 2673 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2674 unsigned LSB = I.getOperand(3).getImm(); 2675 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2676 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2677 MachineInstrBuilder(MF, I).addImm(Width - 1); 2678 2679 if (DstSize < 64) { 2680 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2681 "unexpected G_INSERT types"); 2682 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2683 } 2684 2685 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2686 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2687 TII.get(AArch64::SUBREG_TO_REG)) 2688 .addDef(SrcReg) 2689 .addImm(0) 2690 .addUse(I.getOperand(2).getReg()) 2691 .addImm(AArch64::sub_32); 2692 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2693 AArch64::GPR32RegClass, MRI); 2694 I.getOperand(2).setReg(SrcReg); 2695 2696 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2697 } 2698 case TargetOpcode::G_FRAME_INDEX: { 2699 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2700 if (Ty != LLT::pointer(0, 64)) { 2701 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2702 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2703 return false; 2704 } 2705 I.setDesc(TII.get(AArch64::ADDXri)); 2706 2707 // MOs for a #0 shifted immediate. 2708 I.addOperand(MachineOperand::CreateImm(0)); 2709 I.addOperand(MachineOperand::CreateImm(0)); 2710 2711 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2712 } 2713 2714 case TargetOpcode::G_GLOBAL_VALUE: { 2715 auto GV = I.getOperand(1).getGlobal(); 2716 if (GV->isThreadLocal()) 2717 return selectTLSGlobalValue(I, MRI); 2718 2719 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 2720 if (OpFlags & AArch64II::MO_GOT) { 2721 I.setDesc(TII.get(AArch64::LOADgot)); 2722 I.getOperand(1).setTargetFlags(OpFlags); 2723 } else if (TM.getCodeModel() == CodeModel::Large) { 2724 // Materialize the global using movz/movk instructions. 
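      // materializeLargeCMVal expands this into a MOVZ of bits [15:0] of the
      // address followed by MOVKs of bits [31:16], [47:32] and [63:48]
      // (MO_G0..MO_G3).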
2725 materializeLargeCMVal(I, GV, OpFlags); 2726 I.eraseFromParent(); 2727 return true; 2728 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2729 I.setDesc(TII.get(AArch64::ADR)); 2730 I.getOperand(1).setTargetFlags(OpFlags); 2731 } else { 2732 I.setDesc(TII.get(AArch64::MOVaddr)); 2733 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2734 MachineInstrBuilder MIB(MF, I); 2735 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2736 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2737 } 2738 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2739 } 2740 2741 case TargetOpcode::G_ZEXTLOAD: 2742 case TargetOpcode::G_LOAD: 2743 case TargetOpcode::G_STORE: { 2744 GLoadStore &LdSt = cast<GLoadStore>(I); 2745 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 2746 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 2747 2748 if (PtrTy != LLT::pointer(0, 64)) { 2749 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 2750 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2751 return false; 2752 } 2753 2754 uint64_t MemSizeInBytes = LdSt.getMemSize(); 2755 unsigned MemSizeInBits = LdSt.getMemSizeInBits(); 2756 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 2757 2758 // Need special instructions for atomics that affect ordering. 2759 if (Order != AtomicOrdering::NotAtomic && 2760 Order != AtomicOrdering::Unordered && 2761 Order != AtomicOrdering::Monotonic) { 2762 assert(!isa<GZExtLoad>(LdSt)); 2763 if (MemSizeInBytes > 64) 2764 return false; 2765 2766 if (isa<GLoad>(LdSt)) { 2767 static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH, 2768 AArch64::LDARW, AArch64::LDARX}; 2769 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2770 } else { 2771 static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 2772 AArch64::STLRW, AArch64::STLRX}; 2773 Register ValReg = LdSt.getReg(0); 2774 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 2775 // Emit a subreg copy of 32 bits. 2776 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2777 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 2778 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 2779 I.getOperand(0).setReg(NewVal); 2780 } 2781 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2782 } 2783 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2784 return true; 2785 } 2786 2787 #ifndef NDEBUG 2788 const Register PtrReg = LdSt.getPointerReg(); 2789 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 2790 // Check that the pointer register is valid. 2791 assert(PtrRB.getID() == AArch64::GPRRegBankID && 2792 "Load/Store pointer operand isn't a GPR"); 2793 assert(MRI.getType(PtrReg).isPointer() && 2794 "Load/Store pointer operand isn't a pointer"); 2795 #endif 2796 2797 const Register ValReg = LdSt.getReg(0); 2798 const LLT ValTy = MRI.getType(ValReg); 2799 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 2800 2801 // The code below doesn't support truncating stores, so we need to split it 2802 // again. 2803 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2804 unsigned SubReg; 2805 LLT MemTy = LdSt.getMMO().getMemoryType(); 2806 auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI); 2807 if (!getSubRegForClass(RC, TRI, SubReg)) 2808 return false; 2809 2810 // Generate a subreg copy. 
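      // E.g. for a 32-bit store of a 64-bit value, copy out the sub_32 part
      // and feed that narrower register to the store instead.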
2811 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 2812 .addReg(ValReg, 0, SubReg) 2813 .getReg(0); 2814 RBI.constrainGenericRegister(Copy, *RC, MRI); 2815 LdSt.getOperand(0).setReg(Copy); 2816 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2817 // If this is an any-extending load from the FPR bank, split it into a regular 2818 // load + extend. 2819 if (RB.getID() == AArch64::FPRRegBankID) { 2820 unsigned SubReg; 2821 LLT MemTy = LdSt.getMMO().getMemoryType(); 2822 auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI); 2823 if (!getSubRegForClass(RC, TRI, SubReg)) 2824 return false; 2825 Register OldDst = LdSt.getReg(0); 2826 Register NewDst = 2827 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 2828 LdSt.getOperand(0).setReg(NewDst); 2829 MRI.setRegBank(NewDst, RB); 2830 // Generate a SUBREG_TO_REG to extend it. 2831 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 2832 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 2833 .addImm(0) 2834 .addUse(NewDst) 2835 .addImm(SubReg); 2836 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI); 2837 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 2838 MIB.setInstr(LdSt); 2839 } 2840 } 2841 2842 // Helper lambda for partially selecting I. Either returns the original 2843 // instruction with an updated opcode, or a new instruction. 2844 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 2845 bool IsStore = isa<GStore>(I); 2846 const unsigned NewOpc = 2847 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 2848 if (NewOpc == I.getOpcode()) 2849 return nullptr; 2850 // Check if we can fold anything into the addressing mode. 2851 auto AddrModeFns = 2852 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 2853 if (!AddrModeFns) { 2854 // Can't fold anything. Use the original instruction. 2855 I.setDesc(TII.get(NewOpc)); 2856 I.addOperand(MachineOperand::CreateImm(0)); 2857 return &I; 2858 } 2859 2860 // Folded something. Create a new instruction and return it. 2861 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 2862 Register CurValReg = I.getOperand(0).getReg(); 2863 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 2864 NewInst.cloneMemRefs(I); 2865 for (auto &Fn : *AddrModeFns) 2866 Fn(NewInst); 2867 I.eraseFromParent(); 2868 return &*NewInst; 2869 }; 2870 2871 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 2872 if (!LoadStore) 2873 return false; 2874 2875 // If we're storing a 0, use WZR/XZR. 2876 if (Opcode == TargetOpcode::G_STORE) { 2877 auto CVal = getIConstantVRegValWithLookThrough( 2878 LoadStore->getOperand(0).getReg(), MRI); 2879 if (CVal && CVal->Value == 0) { 2880 switch (LoadStore->getOpcode()) { 2881 case AArch64::STRWui: 2882 case AArch64::STRHHui: 2883 case AArch64::STRBBui: 2884 LoadStore->getOperand(0).setReg(AArch64::WZR); 2885 break; 2886 case AArch64::STRXui: 2887 LoadStore->getOperand(0).setReg(AArch64::XZR); 2888 break; 2889 } 2890 } 2891 } 2892 2893 if (IsZExtLoad) { 2894 // The zextload from a smaller type to i32 should be handled by the 2895 // importer. 2896 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 2897 return false; 2898 // If we have a ZEXTLOAD then change the load's type to be a narrower reg 2899 // and zero_extend with SUBREG_TO_REG. 
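      // Writing a W register implicitly zeroes bits [63:32] of the
      // corresponding X register, so the SUBREG_TO_REG below is all that is
      // needed; no extra extension instruction is emitted.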
2900 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2901 Register DstReg = LoadStore->getOperand(0).getReg(); 2902 LoadStore->getOperand(0).setReg(LdReg); 2903 2904 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 2905 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 2906 .addImm(0) 2907 .addUse(LdReg) 2908 .addImm(AArch64::sub_32); 2909 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2910 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 2911 MRI); 2912 } 2913 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2914 } 2915 2916 case TargetOpcode::G_SMULH: 2917 case TargetOpcode::G_UMULH: { 2918 // Reject the various things we don't support yet. 2919 if (unsupportedBinOp(I, RBI, MRI, TRI)) 2920 return false; 2921 2922 const Register DefReg = I.getOperand(0).getReg(); 2923 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2924 2925 if (RB.getID() != AArch64::GPRRegBankID) { 2926 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); 2927 return false; 2928 } 2929 2930 if (Ty != LLT::scalar(64)) { 2931 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty 2932 << ", expected: " << LLT::scalar(64) << '\n'); 2933 return false; 2934 } 2935 2936 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr 2937 : AArch64::UMULHrr; 2938 I.setDesc(TII.get(NewOpc)); 2939 2940 // Now that we selected an opcode, we need to constrain the register 2941 // operands to use appropriate classes. 2942 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2943 } 2944 case TargetOpcode::G_LSHR: 2945 case TargetOpcode::G_ASHR: 2946 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 2947 return selectVectorAshrLshr(I, MRI); 2948 LLVM_FALLTHROUGH; 2949 case TargetOpcode::G_SHL: 2950 if (Opcode == TargetOpcode::G_SHL && 2951 MRI.getType(I.getOperand(0).getReg()).isVector()) 2952 return selectVectorSHL(I, MRI); 2953 LLVM_FALLTHROUGH; 2954 case TargetOpcode::G_FADD: 2955 case TargetOpcode::G_FSUB: 2956 case TargetOpcode::G_FMUL: 2957 case TargetOpcode::G_FDIV: 2958 case TargetOpcode::G_OR: { 2959 // Reject the various things we don't support yet. 2960 if (unsupportedBinOp(I, RBI, MRI, TRI)) 2961 return false; 2962 2963 const unsigned OpSize = Ty.getSizeInBits(); 2964 2965 const Register DefReg = I.getOperand(0).getReg(); 2966 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2967 2968 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 2969 if (NewOpc == I.getOpcode()) 2970 return false; 2971 2972 I.setDesc(TII.get(NewOpc)); 2973 // FIXME: Should the type be always reset in setDesc? 2974 2975 // Now that we selected an opcode, we need to constrain the register 2976 // operands to use appropriate classes. 2977 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2978 } 2979 2980 case TargetOpcode::G_PTR_ADD: { 2981 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 2982 I.eraseFromParent(); 2983 return true; 2984 } 2985 case TargetOpcode::G_SADDO: 2986 case TargetOpcode::G_UADDO: 2987 case TargetOpcode::G_SSUBO: 2988 case TargetOpcode::G_USUBO: { 2989 // Emit the operation and get the correct condition code. 2990 auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(), 2991 I.getOperand(2), I.getOperand(3), MIB); 2992 2993 // Now, put the overflow result in the register given by the first operand 2994 // to the overflow op. 
CSINC increments the result when the predicate is 2995 // false, so to get the increment when it's true, we need to use the 2996 // inverse. In this case, we want to increment when carry is set. 2997 Register ZReg = AArch64::WZR; 2998 emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg, 2999 getInvertedCondCode(OpAndCC.second), MIB); 3000 I.eraseFromParent(); 3001 return true; 3002 } 3003 3004 case TargetOpcode::G_PTRMASK: { 3005 Register MaskReg = I.getOperand(2).getReg(); 3006 Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3007 // TODO: Implement arbitrary cases 3008 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3009 return false; 3010 3011 uint64_t Mask = *MaskVal; 3012 I.setDesc(TII.get(AArch64::ANDXri)); 3013 I.getOperand(2).ChangeToImmediate( 3014 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3015 3016 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3017 } 3018 case TargetOpcode::G_PTRTOINT: 3019 case TargetOpcode::G_TRUNC: { 3020 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3021 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3022 3023 const Register DstReg = I.getOperand(0).getReg(); 3024 const Register SrcReg = I.getOperand(1).getReg(); 3025 3026 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3027 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3028 3029 if (DstRB.getID() != SrcRB.getID()) { 3030 LLVM_DEBUG( 3031 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3032 return false; 3033 } 3034 3035 if (DstRB.getID() == AArch64::GPRRegBankID) { 3036 const TargetRegisterClass *DstRC = 3037 getRegClassForTypeOnBank(DstTy, DstRB, RBI); 3038 if (!DstRC) 3039 return false; 3040 3041 const TargetRegisterClass *SrcRC = 3042 getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); 3043 if (!SrcRC) 3044 return false; 3045 3046 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3047 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3048 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); 3049 return false; 3050 } 3051 3052 if (DstRC == SrcRC) { 3053 // Nothing to be done 3054 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3055 SrcTy == LLT::scalar(64)) { 3056 llvm_unreachable("TableGen can import this case"); 3057 return false; 3058 } else if (DstRC == &AArch64::GPR32RegClass && 3059 SrcRC == &AArch64::GPR64RegClass) { 3060 I.getOperand(1).setSubReg(AArch64::sub_32); 3061 } else { 3062 LLVM_DEBUG( 3063 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3064 return false; 3065 } 3066 3067 I.setDesc(TII.get(TargetOpcode::COPY)); 3068 return true; 3069 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3070 if (DstTy == LLT::fixed_vector(4, 16) && 3071 SrcTy == LLT::fixed_vector(4, 32)) { 3072 I.setDesc(TII.get(AArch64::XTNv4i16)); 3073 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3074 return true; 3075 } 3076 3077 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3078 MachineInstr *Extract = emitExtractVectorElt( 3079 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3080 if (!Extract) 3081 return false; 3082 I.eraseFromParent(); 3083 return true; 3084 } 3085 3086 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 
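      // Pointer elements are already 64 bits wide, so (assuming the legalizer
      // has given the result the same size) the value just needs to be
      // reinterpreted on the same register bank via a plain COPY.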
3087 if (Opcode == TargetOpcode::G_PTRTOINT) { 3088 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3089 I.setDesc(TII.get(TargetOpcode::COPY)); 3090 return selectCopy(I, TII, MRI, TRI, RBI); 3091 } 3092 } 3093 3094 return false; 3095 } 3096 3097 case TargetOpcode::G_ANYEXT: { 3098 if (selectUSMovFromExtend(I, MRI)) 3099 return true; 3100 3101 const Register DstReg = I.getOperand(0).getReg(); 3102 const Register SrcReg = I.getOperand(1).getReg(); 3103 3104 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3105 if (RBDst.getID() != AArch64::GPRRegBankID) { 3106 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3107 << ", expected: GPR\n"); 3108 return false; 3109 } 3110 3111 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3112 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3113 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3114 << ", expected: GPR\n"); 3115 return false; 3116 } 3117 3118 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3119 3120 if (DstSize == 0) { 3121 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3122 return false; 3123 } 3124 3125 if (DstSize != 64 && DstSize > 32) { 3126 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3127 << ", expected: 32 or 64\n"); 3128 return false; 3129 } 3130 // At this point G_ANYEXT is just like a plain COPY, but we need 3131 // to explicitly form the 64-bit value if any. 3132 if (DstSize > 32) { 3133 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3134 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3135 .addDef(ExtSrc) 3136 .addImm(0) 3137 .addUse(SrcReg) 3138 .addImm(AArch64::sub_32); 3139 I.getOperand(1).setReg(ExtSrc); 3140 } 3141 return selectCopy(I, TII, MRI, TRI, RBI); 3142 } 3143 3144 case TargetOpcode::G_ZEXT: 3145 case TargetOpcode::G_SEXT_INREG: 3146 case TargetOpcode::G_SEXT: { 3147 if (selectUSMovFromExtend(I, MRI)) 3148 return true; 3149 3150 unsigned Opcode = I.getOpcode(); 3151 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3152 const Register DefReg = I.getOperand(0).getReg(); 3153 Register SrcReg = I.getOperand(1).getReg(); 3154 const LLT DstTy = MRI.getType(DefReg); 3155 const LLT SrcTy = MRI.getType(SrcReg); 3156 unsigned DstSize = DstTy.getSizeInBits(); 3157 unsigned SrcSize = SrcTy.getSizeInBits(); 3158 3159 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3160 // extended is encoded in the imm. 3161 if (Opcode == TargetOpcode::G_SEXT_INREG) 3162 SrcSize = I.getOperand(2).getImm(); 3163 3164 if (DstTy.isVector()) 3165 return false; // Should be handled by imported patterns. 3166 3167 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3168 AArch64::GPRRegBankID && 3169 "Unexpected ext regbank"); 3170 3171 MachineInstr *ExtI; 3172 3173 // First check if we're extending the result of a load which has a dest type 3174 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest 3175 // GPR register on AArch64 and all loads which are smaller automatically 3176 // zero-extend the upper bits. E.g. 
3177 // %v(s8) = G_LOAD %p, :: (load 1) 3178 // %v2(s32) = G_ZEXT %v(s8) 3179 if (!IsSigned) { 3180 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3181 bool IsGPR = 3182 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3183 if (LoadMI && IsGPR) { 3184 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3185 unsigned BytesLoaded = MemOp->getSize(); 3186 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3187 return selectCopy(I, TII, MRI, TRI, RBI); 3188 } 3189 3190 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3191 // + SUBREG_TO_REG. 3192 // 3193 // If we are zero extending from 32 bits to 64 bits, it's possible that 3194 // the instruction implicitly does the zero extend for us. In that case, 3195 // we only need the SUBREG_TO_REG. 3196 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3197 // Unlike with the G_LOAD case, we don't want to look through copies 3198 // here. (See isDef32.) 3199 MachineInstr *Def = MRI.getVRegDef(SrcReg); 3200 Register SubregToRegSrc = SrcReg; 3201 3202 // Does the instruction implicitly zero extend? 3203 if (!Def || !isDef32(*Def)) { 3204 // No. Zero out using an OR. 3205 Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3206 const Register ZReg = AArch64::WZR; 3207 MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0); 3208 SubregToRegSrc = OrDst; 3209 } 3210 3211 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3212 .addImm(0) 3213 .addUse(SubregToRegSrc) 3214 .addImm(AArch64::sub_32); 3215 3216 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3217 MRI)) { 3218 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3219 return false; 3220 } 3221 3222 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3223 MRI)) { 3224 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3225 return false; 3226 } 3227 3228 I.eraseFromParent(); 3229 return true; 3230 } 3231 } 3232 3233 if (DstSize == 64) { 3234 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3235 // FIXME: Can we avoid manually doing this? 3236 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3237 MRI)) { 3238 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3239 << " operand\n"); 3240 return false; 3241 } 3242 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3243 {&AArch64::GPR64RegClass}, {}) 3244 .addImm(0) 3245 .addUse(SrcReg) 3246 .addImm(AArch64::sub_32) 3247 .getReg(0); 3248 } 3249 3250 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3251 {DefReg}, {SrcReg}) 3252 .addImm(0) 3253 .addImm(SrcSize - 1); 3254 } else if (DstSize <= 32) { 3255 ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri, 3256 {DefReg}, {SrcReg}) 3257 .addImm(0) 3258 .addImm(SrcSize - 1); 3259 } else { 3260 return false; 3261 } 3262 3263 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3264 I.eraseFromParent(); 3265 return true; 3266 } 3267 3268 case TargetOpcode::G_SITOFP: 3269 case TargetOpcode::G_UITOFP: 3270 case TargetOpcode::G_FPTOSI: 3271 case TargetOpcode::G_FPTOUI: { 3272 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3273 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3274 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3275 if (NewOpc == Opcode) 3276 return false; 3277 3278 I.setDesc(TII.get(NewOpc)); 3279 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3280 3281 return true; 3282 } 3283 3284 case TargetOpcode::G_FREEZE: 3285 return selectCopy(I, TII, MRI, TRI, RBI); 3286 3287 case TargetOpcode::G_INTTOPTR: 3288 // The importer is currently unable to import pointer types since they 3289 // didn't exist in SelectionDAG. 3290 return selectCopy(I, TII, MRI, TRI, RBI); 3291 3292 case TargetOpcode::G_BITCAST: 3293 // Imported SelectionDAG rules can handle every bitcast except those that 3294 // bitcast from a type to the same type. Ideally, these shouldn't occur 3295 // but we might not run an optimizer that deletes them. The other exception 3296 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3297 // of them. 3298 return selectCopy(I, TII, MRI, TRI, RBI); 3299 3300 case TargetOpcode::G_SELECT: { 3301 if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { 3302 LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty 3303 << ", expected: " << LLT::scalar(1) << '\n'); 3304 return false; 3305 } 3306 3307 const Register CondReg = I.getOperand(1).getReg(); 3308 const Register TReg = I.getOperand(2).getReg(); 3309 const Register FReg = I.getOperand(3).getReg(); 3310 3311 if (tryOptSelect(I)) 3312 return true; 3313 3314 // Make sure to use an unused vreg instead of wzr, so that the peephole 3315 // optimizations will be able to optimize these. 
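// The emitted sequence is roughly:
//   %dead:gpr32 = ANDSWri %cond, <logical imm 0x1>   (i.e. tst wN, #0x1)
// followed by a conditional select on NE from emitSelect(), which may use
// CSEL or one of the CSINC/CSINV/CSNEG forms.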
3316 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3317 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3318 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3319 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3320 if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) 3321 return false; 3322 I.eraseFromParent(); 3323 return true; 3324 } 3325 case TargetOpcode::G_ICMP: { 3326 if (Ty.isVector()) 3327 return selectVectorICmp(I, MRI); 3328 3329 if (Ty != LLT::scalar(32)) { 3330 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3331 << ", expected: " << LLT::scalar(32) << '\n'); 3332 return false; 3333 } 3334 3335 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3336 const AArch64CC::CondCode InvCC = 3337 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3338 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3339 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3340 /*Src2=*/AArch64::WZR, InvCC, MIB); 3341 I.eraseFromParent(); 3342 return true; 3343 } 3344 3345 case TargetOpcode::G_FCMP: { 3346 CmpInst::Predicate Pred = 3347 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3348 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3349 Pred) || 3350 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3351 return false; 3352 I.eraseFromParent(); 3353 return true; 3354 } 3355 case TargetOpcode::G_VASTART: 3356 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) 3357 : selectVaStartAAPCS(I, MF, MRI); 3358 case TargetOpcode::G_INTRINSIC: 3359 return selectIntrinsic(I, MRI); 3360 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3361 return selectIntrinsicWithSideEffects(I, MRI); 3362 case TargetOpcode::G_IMPLICIT_DEF: { 3363 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 3364 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3365 const Register DstReg = I.getOperand(0).getReg(); 3366 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3367 const TargetRegisterClass *DstRC = 3368 getRegClassForTypeOnBank(DstTy, DstRB, RBI); 3369 RBI.constrainGenericRegister(DstReg, *DstRC, MRI); 3370 return true; 3371 } 3372 case TargetOpcode::G_BLOCK_ADDR: { 3373 if (TM.getCodeModel() == CodeModel::Large) { 3374 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); 3375 I.eraseFromParent(); 3376 return true; 3377 } else { 3378 I.setDesc(TII.get(AArch64::MOVaddrBA)); 3379 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), 3380 I.getOperand(0).getReg()) 3381 .addBlockAddress(I.getOperand(1).getBlockAddress(), 3382 /* Offset */ 0, AArch64II::MO_PAGE) 3383 .addBlockAddress( 3384 I.getOperand(1).getBlockAddress(), /* Offset */ 0, 3385 AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3386 I.eraseFromParent(); 3387 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3388 } 3389 } 3390 case AArch64::G_DUP: { 3391 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by 3392 // imported patterns. Do it manually here. Avoiding generating s16 gpr is 3393 // difficult because at RBS we may end up pessimizing the fpr case if we 3394 // decided to add an anyextend to fix this. Manual selection is the most 3395 // robust solution for now. 3396 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 3397 AArch64::GPRRegBankID) 3398 return false; // We expect the fpr regbank case to be imported. 
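// Pick the gpr-source DUP variant from the destination vector type below,
// e.g. a <4 x s16> G_DUP of a gpr scalar becomes DUPv4i16gpr.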
3399 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3400 if (VecTy == LLT::fixed_vector(8, 8)) 3401 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3402 else if (VecTy == LLT::fixed_vector(16, 8)) 3403 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3404 else if (VecTy == LLT::fixed_vector(4, 16)) 3405 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3406 else if (VecTy == LLT::fixed_vector(8, 16)) 3407 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3408 else 3409 return false; 3410 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3411 } 3412 case TargetOpcode::G_INTRINSIC_TRUNC: 3413 return selectIntrinsicTrunc(I, MRI); 3414 case TargetOpcode::G_INTRINSIC_ROUND: 3415 return selectIntrinsicRound(I, MRI); 3416 case TargetOpcode::G_BUILD_VECTOR: 3417 return selectBuildVector(I, MRI); 3418 case TargetOpcode::G_MERGE_VALUES: 3419 return selectMergeValues(I, MRI); 3420 case TargetOpcode::G_UNMERGE_VALUES: 3421 return selectUnmergeValues(I, MRI); 3422 case TargetOpcode::G_SHUFFLE_VECTOR: 3423 return selectShuffleVector(I, MRI); 3424 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3425 return selectExtractElt(I, MRI); 3426 case TargetOpcode::G_INSERT_VECTOR_ELT: 3427 return selectInsertElt(I, MRI); 3428 case TargetOpcode::G_CONCAT_VECTORS: 3429 return selectConcatVectors(I, MRI); 3430 case TargetOpcode::G_JUMP_TABLE: 3431 return selectJumpTable(I, MRI); 3432 case TargetOpcode::G_VECREDUCE_FADD: 3433 case TargetOpcode::G_VECREDUCE_ADD: 3434 return selectReduction(I, MRI); 3435 } 3436 3437 return false; 3438 } 3439 3440 bool AArch64InstructionSelector::selectReduction(MachineInstr &I, 3441 MachineRegisterInfo &MRI) { 3442 Register VecReg = I.getOperand(1).getReg(); 3443 LLT VecTy = MRI.getType(VecReg); 3444 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { 3445 // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit 3446 // a subregister copy afterwards. 
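// Roughly: %sum:fpr64 = ADDPv2i32 %vec, %vec, then %dst:fpr32 = COPY
// %sum.ssub to extract the 32-bit result.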
3447 if (VecTy == LLT::fixed_vector(2, 32)) { 3448 Register DstReg = I.getOperand(0).getReg(); 3449 auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass}, 3450 {VecReg, VecReg}); 3451 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3452 .addReg(AddP.getReg(0), 0, AArch64::ssub) 3453 .getReg(0); 3454 RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI); 3455 I.eraseFromParent(); 3456 return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI); 3457 } 3458 3459 unsigned Opc = 0; 3460 if (VecTy == LLT::fixed_vector(16, 8)) 3461 Opc = AArch64::ADDVv16i8v; 3462 else if (VecTy == LLT::fixed_vector(8, 16)) 3463 Opc = AArch64::ADDVv8i16v; 3464 else if (VecTy == LLT::fixed_vector(4, 32)) 3465 Opc = AArch64::ADDVv4i32v; 3466 else if (VecTy == LLT::fixed_vector(2, 64)) 3467 Opc = AArch64::ADDPv2i64p; 3468 else { 3469 LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); 3470 return false; 3471 } 3472 I.setDesc(TII.get(Opc)); 3473 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3474 } 3475 3476 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) { 3477 unsigned Opc = 0; 3478 if (VecTy == LLT::fixed_vector(2, 32)) 3479 Opc = AArch64::FADDPv2i32p; 3480 else if (VecTy == LLT::fixed_vector(2, 64)) 3481 Opc = AArch64::FADDPv2i64p; 3482 else { 3483 LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction"); 3484 return false; 3485 } 3486 I.setDesc(TII.get(Opc)); 3487 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3488 } 3489 return false; 3490 } 3491 3492 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3493 MachineRegisterInfo &MRI) { 3494 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3495 Register JTAddr = I.getOperand(0).getReg(); 3496 unsigned JTI = I.getOperand(1).getIndex(); 3497 Register Index = I.getOperand(2).getReg(); 3498 3499 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3500 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3501 3502 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3503 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3504 {TargetReg, ScratchReg}, {JTAddr, Index}) 3505 .addJumpTableIndex(JTI); 3506 // Build the indirect branch. 3507 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3508 I.eraseFromParent(); 3509 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3510 } 3511 3512 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3513 MachineRegisterInfo &MRI) { 3514 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3515 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3516 3517 Register DstReg = I.getOperand(0).getReg(); 3518 unsigned JTI = I.getOperand(1).getIndex(); 3519 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 
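// In assembly this ends up as something like:
//   adrp xN, .LJTI<f>_<j>
//   add  xN, xN, :lo12:.LJTI<f>_<j>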
3520 auto MovMI = 3521 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3522 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3523 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3524 I.eraseFromParent(); 3525 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3526 } 3527 3528 bool AArch64InstructionSelector::selectTLSGlobalValue( 3529 MachineInstr &I, MachineRegisterInfo &MRI) { 3530 if (!STI.isTargetMachO()) 3531 return false; 3532 MachineFunction &MF = *I.getParent()->getParent(); 3533 MF.getFrameInfo().setAdjustsStack(true); 3534 3535 const auto &GlobalOp = I.getOperand(1); 3536 assert(GlobalOp.getOffset() == 0 && 3537 "Shouldn't have an offset on TLS globals!"); 3538 const GlobalValue &GV = *GlobalOp.getGlobal(); 3539 3540 auto LoadGOT = 3541 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3542 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3543 3544 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3545 {LoadGOT.getReg(0)}) 3546 .addImm(0); 3547 3548 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3549 // TLS calls preserve all registers except those that absolutely must be 3550 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3551 // silly). 3552 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) 3553 .addUse(AArch64::X0, RegState::Implicit) 3554 .addDef(AArch64::X0, RegState::Implicit) 3555 .addRegMask(TRI.getTLSCallPreservedMask()); 3556 3557 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3558 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3559 MRI); 3560 I.eraseFromParent(); 3561 return true; 3562 } 3563 3564 bool AArch64InstructionSelector::selectIntrinsicTrunc( 3565 MachineInstr &I, MachineRegisterInfo &MRI) const { 3566 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3567 3568 // Select the correct opcode. 3569 unsigned Opc = 0; 3570 if (!SrcTy.isVector()) { 3571 switch (SrcTy.getSizeInBits()) { 3572 default: 3573 case 16: 3574 Opc = AArch64::FRINTZHr; 3575 break; 3576 case 32: 3577 Opc = AArch64::FRINTZSr; 3578 break; 3579 case 64: 3580 Opc = AArch64::FRINTZDr; 3581 break; 3582 } 3583 } else { 3584 unsigned NumElts = SrcTy.getNumElements(); 3585 switch (SrcTy.getElementType().getSizeInBits()) { 3586 default: 3587 break; 3588 case 16: 3589 if (NumElts == 4) 3590 Opc = AArch64::FRINTZv4f16; 3591 else if (NumElts == 8) 3592 Opc = AArch64::FRINTZv8f16; 3593 break; 3594 case 32: 3595 if (NumElts == 2) 3596 Opc = AArch64::FRINTZv2f32; 3597 else if (NumElts == 4) 3598 Opc = AArch64::FRINTZv4f32; 3599 break; 3600 case 64: 3601 if (NumElts == 2) 3602 Opc = AArch64::FRINTZv2f64; 3603 break; 3604 } 3605 } 3606 3607 if (!Opc) { 3608 // Didn't get an opcode above, bail. 3609 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); 3610 return false; 3611 } 3612 3613 // Legalization would have set us up perfectly for this; we just need to 3614 // set the opcode and move on. 3615 I.setDesc(TII.get(Opc)); 3616 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3617 } 3618 3619 bool AArch64InstructionSelector::selectIntrinsicRound( 3620 MachineInstr &I, MachineRegisterInfo &MRI) const { 3621 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3622 3623 // Select the correct opcode. 
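// G_INTRINSIC_ROUND is llvm.round (round to nearest, ties away from zero),
// which corresponds to the FRINTA family of instructions.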
3624 unsigned Opc = 0; 3625 if (!SrcTy.isVector()) { 3626 switch (SrcTy.getSizeInBits()) { 3627 default: 3628 case 16: 3629 Opc = AArch64::FRINTAHr; 3630 break; 3631 case 32: 3632 Opc = AArch64::FRINTASr; 3633 break; 3634 case 64: 3635 Opc = AArch64::FRINTADr; 3636 break; 3637 } 3638 } else { 3639 unsigned NumElts = SrcTy.getNumElements(); 3640 switch (SrcTy.getElementType().getSizeInBits()) { 3641 default: 3642 break; 3643 case 16: 3644 if (NumElts == 4) 3645 Opc = AArch64::FRINTAv4f16; 3646 else if (NumElts == 8) 3647 Opc = AArch64::FRINTAv8f16; 3648 break; 3649 case 32: 3650 if (NumElts == 2) 3651 Opc = AArch64::FRINTAv2f32; 3652 else if (NumElts == 4) 3653 Opc = AArch64::FRINTAv4f32; 3654 break; 3655 case 64: 3656 if (NumElts == 2) 3657 Opc = AArch64::FRINTAv2f64; 3658 break; 3659 } 3660 } 3661 3662 if (!Opc) { 3663 // Didn't get an opcode above, bail. 3664 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); 3665 return false; 3666 } 3667 3668 // Legalization would have set us up perfectly for this; we just need to 3669 // set the opcode and move on. 3670 I.setDesc(TII.get(Opc)); 3671 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3672 } 3673 3674 bool AArch64InstructionSelector::selectVectorICmp( 3675 MachineInstr &I, MachineRegisterInfo &MRI) { 3676 Register DstReg = I.getOperand(0).getReg(); 3677 LLT DstTy = MRI.getType(DstReg); 3678 Register SrcReg = I.getOperand(2).getReg(); 3679 Register Src2Reg = I.getOperand(3).getReg(); 3680 LLT SrcTy = MRI.getType(SrcReg); 3681 3682 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); 3683 unsigned NumElts = DstTy.getNumElements(); 3684 3685 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b 3686 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 3687 // Third index is cc opcode: 3688 // 0 == eq 3689 // 1 == ugt 3690 // 2 == uge 3691 // 3 == ult 3692 // 4 == ule 3693 // 5 == sgt 3694 // 6 == sge 3695 // 7 == slt 3696 // 8 == sle 3697 // ne is done by negating 'eq' result. 3698 3699 // This table below assumes that for some comparisons the operands will be 3700 // commuted. 3701 // ult op == commute + ugt op 3702 // ule op == commute + uge op 3703 // slt op == commute + sgt op 3704 // sle op == commute + sge op 3705 unsigned PredIdx = 0; 3706 bool SwapOperands = false; 3707 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 3708 switch (Pred) { 3709 case CmpInst::ICMP_NE: 3710 case CmpInst::ICMP_EQ: 3711 PredIdx = 0; 3712 break; 3713 case CmpInst::ICMP_UGT: 3714 PredIdx = 1; 3715 break; 3716 case CmpInst::ICMP_UGE: 3717 PredIdx = 2; 3718 break; 3719 case CmpInst::ICMP_ULT: 3720 PredIdx = 3; 3721 SwapOperands = true; 3722 break; 3723 case CmpInst::ICMP_ULE: 3724 PredIdx = 4; 3725 SwapOperands = true; 3726 break; 3727 case CmpInst::ICMP_SGT: 3728 PredIdx = 5; 3729 break; 3730 case CmpInst::ICMP_SGE: 3731 PredIdx = 6; 3732 break; 3733 case CmpInst::ICMP_SLT: 3734 PredIdx = 7; 3735 SwapOperands = true; 3736 break; 3737 case CmpInst::ICMP_SLE: 3738 PredIdx = 8; 3739 SwapOperands = true; 3740 break; 3741 default: 3742 llvm_unreachable("Unhandled icmp predicate"); 3743 return false; 3744 } 3745 3746 // This table obviously should be tablegen'd when we have our GISel native 3747 // tablegen selector. 
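// For example, an unsigned-greater-than compare of <4 x s32> vectors uses
// EltIdx = Log2_32(32 / 8) = 2, NumEltsIdx = Log2_32(4 / 2) = 1 and
// PredIdx = 1, i.e. OpcTable[2][1][1] == AArch64::CMHIv4i32.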
3748 3749 static const unsigned OpcTable[4][4][9] = { 3750 { 3751 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3752 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3753 0 /* invalid */}, 3754 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3755 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3756 0 /* invalid */}, 3757 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, 3758 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, 3759 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, 3760 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, 3761 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, 3762 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} 3763 }, 3764 { 3765 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3766 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3767 0 /* invalid */}, 3768 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, 3769 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, 3770 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, 3771 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, 3772 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, 3773 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, 3774 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3775 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3776 0 /* invalid */} 3777 }, 3778 { 3779 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, 3780 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, 3781 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, 3782 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, 3783 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, 3784 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, 3785 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3786 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3787 0 /* invalid */}, 3788 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3789 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3790 0 /* invalid */} 3791 }, 3792 { 3793 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, 3794 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, 3795 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, 3796 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3797 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3798 0 /* invalid */}, 3799 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3800 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3801 0 /* invalid */}, 3802 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3803 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3804 0 /* invalid */} 3805 }, 3806 }; 3807 unsigned EltIdx = Log2_32(SrcEltSize / 8); 3808 unsigned NumEltsIdx = Log2_32(NumElts / 2); 3809 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; 3810 if (!Opc) { 3811 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); 3812 return false; 3813 } 3814 3815 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3816 const TargetRegisterClass *SrcRC = 3817 getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); 3818 if (!SrcRC) { 3819 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3820 
return false; 3821 } 3822 3823 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; 3824 if (SrcTy.getSizeInBits() == 128) 3825 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; 3826 3827 if (SwapOperands) 3828 std::swap(SrcReg, Src2Reg); 3829 3830 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); 3831 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3832 3833 // Invert if we had a 'ne' cc. 3834 if (NotOpc) { 3835 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); 3836 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3837 } else { 3838 MIB.buildCopy(DstReg, Cmp.getReg(0)); 3839 } 3840 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); 3841 I.eraseFromParent(); 3842 return true; 3843 } 3844 3845 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3846 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3847 MachineIRBuilder &MIRBuilder) const { 3848 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3849 3850 auto BuildFn = [&](unsigned SubregIndex) { 3851 auto Ins = 3852 MIRBuilder 3853 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3854 .addImm(SubregIndex); 3855 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3856 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3857 return &*Ins; 3858 }; 3859 3860 switch (EltSize) { 3861 case 16: 3862 return BuildFn(AArch64::hsub); 3863 case 32: 3864 return BuildFn(AArch64::ssub); 3865 case 64: 3866 return BuildFn(AArch64::dsub); 3867 default: 3868 return nullptr; 3869 } 3870 } 3871 3872 bool AArch64InstructionSelector::selectMergeValues( 3873 MachineInstr &I, MachineRegisterInfo &MRI) { 3874 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 3875 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3876 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3877 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 3878 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 3879 3880 if (I.getNumOperands() != 3) 3881 return false; 3882 3883 // Merging 2 s64s into an s128. 
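// Sketch: %tmp:fpr128 = IMPLICIT_DEF, then insert the first source into
// lane 0 and the second source into lane 1 via emitLaneInsert.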
3884 if (DstTy == LLT::scalar(128)) { 3885 if (SrcTy.getSizeInBits() != 64) 3886 return false; 3887 Register DstReg = I.getOperand(0).getReg(); 3888 Register Src1Reg = I.getOperand(1).getReg(); 3889 Register Src2Reg = I.getOperand(2).getReg(); 3890 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 3891 MachineInstr *InsMI = 3892 emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); 3893 if (!InsMI) 3894 return false; 3895 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 3896 Src2Reg, /* LaneIdx */ 1, RB, MIB); 3897 if (!Ins2MI) 3898 return false; 3899 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 3900 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 3901 I.eraseFromParent(); 3902 return true; 3903 } 3904 3905 if (RB.getID() != AArch64::GPRRegBankID) 3906 return false; 3907 3908 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 3909 return false; 3910 3911 auto *DstRC = &AArch64::GPR64RegClass; 3912 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 3913 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3914 TII.get(TargetOpcode::SUBREG_TO_REG)) 3915 .addDef(SubToRegDef) 3916 .addImm(0) 3917 .addUse(I.getOperand(1).getReg()) 3918 .addImm(AArch64::sub_32); 3919 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 3920 // Need to anyext the second scalar before we can use bfm 3921 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3922 TII.get(TargetOpcode::SUBREG_TO_REG)) 3923 .addDef(SubToRegDef2) 3924 .addImm(0) 3925 .addUse(I.getOperand(2).getReg()) 3926 .addImm(AArch64::sub_32); 3927 MachineInstr &BFM = 3928 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 3929 .addDef(I.getOperand(0).getReg()) 3930 .addUse(SubToRegDef) 3931 .addUse(SubToRegDef2) 3932 .addImm(32) 3933 .addImm(31); 3934 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 3935 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 3936 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 3937 I.eraseFromParent(); 3938 return true; 3939 } 3940 3941 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 3942 const unsigned EltSize) { 3943 // Choose a lane copy opcode and subregister based off of the size of the 3944 // vector's elements. 3945 switch (EltSize) { 3946 case 8: 3947 CopyOpc = AArch64::CPYi8; 3948 ExtractSubReg = AArch64::bsub; 3949 break; 3950 case 16: 3951 CopyOpc = AArch64::CPYi16; 3952 ExtractSubReg = AArch64::hsub; 3953 break; 3954 case 32: 3955 CopyOpc = AArch64::CPYi32; 3956 ExtractSubReg = AArch64::ssub; 3957 break; 3958 case 64: 3959 CopyOpc = AArch64::CPYi64; 3960 ExtractSubReg = AArch64::dsub; 3961 break; 3962 default: 3963 // Unknown size, bail out. 
3964 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 3965 return false; 3966 } 3967 return true; 3968 } 3969 3970 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 3971 Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 3972 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 3973 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 3974 unsigned CopyOpc = 0; 3975 unsigned ExtractSubReg = 0; 3976 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 3977 LLVM_DEBUG( 3978 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 3979 return nullptr; 3980 } 3981 3982 const TargetRegisterClass *DstRC = 3983 getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); 3984 if (!DstRC) { 3985 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 3986 return nullptr; 3987 } 3988 3989 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 3990 const LLT &VecTy = MRI.getType(VecReg); 3991 const TargetRegisterClass *VecRC = 3992 getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); 3993 if (!VecRC) { 3994 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3995 return nullptr; 3996 } 3997 3998 // The register that we're going to copy into. 3999 Register InsertReg = VecReg; 4000 if (!DstReg) 4001 DstReg = MRI.createVirtualRegister(DstRC); 4002 // If the lane index is 0, we just use a subregister COPY. 4003 if (LaneIdx == 0) { 4004 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 4005 .addReg(VecReg, 0, ExtractSubReg); 4006 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4007 return &*Copy; 4008 } 4009 4010 // Lane copies require 128-bit wide registers. If we're dealing with an 4011 // unpacked vector, then we need to move up to that width. Insert an implicit 4012 // def and a subregister insert to get us there. 4013 if (VecTy.getSizeInBits() != 128) { 4014 MachineInstr *ScalarToVector = emitScalarToVector( 4015 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 4016 if (!ScalarToVector) 4017 return nullptr; 4018 InsertReg = ScalarToVector->getOperand(0).getReg(); 4019 } 4020 4021 MachineInstr *LaneCopyMI = 4022 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 4023 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 4024 4025 // Make sure that we actually constrain the initial copy. 4026 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4027 return LaneCopyMI; 4028 } 4029 4030 bool AArch64InstructionSelector::selectExtractElt( 4031 MachineInstr &I, MachineRegisterInfo &MRI) { 4032 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 4033 "unexpected opcode!"); 4034 Register DstReg = I.getOperand(0).getReg(); 4035 const LLT NarrowTy = MRI.getType(DstReg); 4036 const Register SrcReg = I.getOperand(1).getReg(); 4037 const LLT WideTy = MRI.getType(SrcReg); 4038 (void)WideTy; 4039 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 4040 "source register size too small!"); 4041 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 4042 4043 // Need the lane index to determine the correct copy opcode. 4044 MachineOperand &LaneIdxOp = I.getOperand(2); 4045 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 4046 4047 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 4048 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 4049 return false; 4050 } 4051 4052 // Find the index to extract from. 
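// Only constant lane indices are handled here; if the index isn't a known
// constant we bail out.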
4053 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 4054 if (!VRegAndVal) 4055 return false; 4056 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 4057 4058 4059 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 4060 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 4061 LaneIdx, MIB); 4062 if (!Extract) 4063 return false; 4064 4065 I.eraseFromParent(); 4066 return true; 4067 } 4068 4069 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 4070 MachineInstr &I, MachineRegisterInfo &MRI) { 4071 unsigned NumElts = I.getNumOperands() - 1; 4072 Register SrcReg = I.getOperand(NumElts).getReg(); 4073 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4074 const LLT SrcTy = MRI.getType(SrcReg); 4075 4076 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 4077 if (SrcTy.getSizeInBits() > 128) { 4078 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 4079 return false; 4080 } 4081 4082 // We implement a split vector operation by treating the sub-vectors as 4083 // scalars and extracting them. 4084 const RegisterBank &DstRB = 4085 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 4086 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4087 Register Dst = I.getOperand(OpIdx).getReg(); 4088 MachineInstr *Extract = 4089 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4090 if (!Extract) 4091 return false; 4092 } 4093 I.eraseFromParent(); 4094 return true; 4095 } 4096 4097 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4098 MachineRegisterInfo &MRI) { 4099 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4100 "unexpected opcode"); 4101 4102 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4103 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4104 AArch64::FPRRegBankID || 4105 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4106 AArch64::FPRRegBankID) { 4107 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4108 "currently unsupported.\n"); 4109 return false; 4110 } 4111 4112 // The last operand is the vector source register, and every other operand is 4113 // a register to unpack into. 4114 unsigned NumElts = I.getNumOperands() - 1; 4115 Register SrcReg = I.getOperand(NumElts).getReg(); 4116 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4117 const LLT WideTy = MRI.getType(SrcReg); 4118 (void)WideTy; 4119 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4120 "can only unmerge from vector or s128 types!"); 4121 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4122 "source register size too small!"); 4123 4124 if (!NarrowTy.isScalar()) 4125 return selectSplitVectorUnmerge(I, MRI); 4126 4127 // Choose a lane copy opcode and subregister based off of the size of the 4128 // vector's elements. 4129 unsigned CopyOpc = 0; 4130 unsigned ExtractSubReg = 0; 4131 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4132 return false; 4133 4134 // Set up for the lane copies. 4135 MachineBasicBlock &MBB = *I.getParent(); 4136 4137 // Stores the registers we'll be copying from. 4138 SmallVector<Register, 4> InsertRegs; 4139 4140 // We'll use the first register twice, so we only need NumElts-1 registers. 4141 unsigned NumInsertRegs = NumElts - 1; 4142 4143 // If our elements fit into exactly 128 bits, then we can copy from the source 4144 // directly. 
Otherwise, we need to do a bit of setup with some subregister
4145 // inserts.
4146 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4147 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4148 } else {
4149 // No. We have to perform subregister inserts. For each insert, create an
4150 // implicit def and a subregister insert, and save the register we create.
4151 const TargetRegisterClass *RC =
4152 getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
4153 WideTy.getScalarSizeInBits() * NumElts);
4154 unsigned SubReg = 0;
4155 bool Found = getSubRegForClass(RC, TRI, SubReg);
4156 (void)Found;
4157 assert(Found && "expected to find last operand's subreg idx");
4158 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4159 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4160 MachineInstr &ImpDefMI =
4161 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4162 ImpDefReg);
4163
4164 // Now, create the subregister insert from SrcReg.
4165 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4166 MachineInstr &InsMI =
4167 *BuildMI(MBB, I, I.getDebugLoc(),
4168 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4169 .addUse(ImpDefReg)
4170 .addUse(SrcReg)
4171 .addImm(SubReg);
4172
4173 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4174 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4175
4176 // Save the register so that we can copy from it after.
4177 InsertRegs.push_back(InsertReg);
4178 }
4179 }
4180
4181 // Now that we've created any necessary subregister inserts, we can
4182 // create the copies.
4183 //
4184 // Perform the first copy separately as a subregister copy.
4185 Register CopyTo = I.getOperand(0).getReg();
4186 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4187 .addReg(InsertRegs[0], 0, ExtractSubReg);
4188 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4189
4190 // Now, perform the remaining copies as vector lane copies.
4191 unsigned LaneIdx = 1;
4192 for (Register InsReg : InsertRegs) {
4193 Register CopyTo = I.getOperand(LaneIdx).getReg();
4194 MachineInstr &CopyInst =
4195 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4196 .addUse(InsReg)
4197 .addImm(LaneIdx);
4198 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4199 ++LaneIdx;
4200 }
4201
4202 // Separately constrain the first copy's destination. Because of the
4203 // limitation in constrainOperandRegClass, we can't guarantee that this will
4204 // actually be constrained. So, do it ourselves using the second operand.
4205 const TargetRegisterClass *RC = 4206 MRI.getRegClassOrNull(I.getOperand(1).getReg()); 4207 if (!RC) { 4208 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); 4209 return false; 4210 } 4211 4212 RBI.constrainGenericRegister(CopyTo, *RC, MRI); 4213 I.eraseFromParent(); 4214 return true; 4215 } 4216 4217 bool AArch64InstructionSelector::selectConcatVectors( 4218 MachineInstr &I, MachineRegisterInfo &MRI) { 4219 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && 4220 "Unexpected opcode"); 4221 Register Dst = I.getOperand(0).getReg(); 4222 Register Op1 = I.getOperand(1).getReg(); 4223 Register Op2 = I.getOperand(2).getReg(); 4224 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); 4225 if (!ConcatMI) 4226 return false; 4227 I.eraseFromParent(); 4228 return true; 4229 } 4230 4231 unsigned 4232 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, 4233 MachineFunction &MF) const { 4234 Type *CPTy = CPVal->getType(); 4235 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); 4236 4237 MachineConstantPool *MCP = MF.getConstantPool(); 4238 return MCP->getConstantPoolIndex(CPVal, Alignment); 4239 } 4240 4241 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( 4242 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { 4243 auto &MF = MIRBuilder.getMF(); 4244 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); 4245 4246 auto Adrp = 4247 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) 4248 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); 4249 4250 MachineInstr *LoadMI = nullptr; 4251 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); 4252 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); 4253 switch (Size) { 4254 case 16: 4255 LoadMI = 4256 &*MIRBuilder 4257 .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) 4258 .addConstantPoolIndex(CPIdx, 0, 4259 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4260 break; 4261 case 8: 4262 LoadMI = 4263 &*MIRBuilder 4264 .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp}) 4265 .addConstantPoolIndex(CPIdx, 0, 4266 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4267 break; 4268 case 4: 4269 LoadMI = 4270 &*MIRBuilder 4271 .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp}) 4272 .addConstantPoolIndex(CPIdx, 0, 4273 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4274 break; 4275 case 2: 4276 LoadMI = 4277 &*MIRBuilder 4278 .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp}) 4279 .addConstantPoolIndex(CPIdx, 0, 4280 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4281 break; 4282 default: 4283 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " 4284 << *CPVal->getType()); 4285 return nullptr; 4286 } 4287 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, 4288 MachineMemOperand::MOLoad, 4289 Size, Align(Size))); 4290 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); 4291 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); 4292 return LoadMI; 4293 } 4294 4295 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given 4296 /// size and RB. 
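/// E.g. inserting a 32-bit element from a GPR uses INSvi32gpr with the ssub
/// index, while a 32-bit element already in an FPR uses INSvi32lane.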
4297 static std::pair<unsigned, unsigned> 4298 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4299 unsigned Opc, SubregIdx; 4300 if (RB.getID() == AArch64::GPRRegBankID) { 4301 if (EltSize == 16) { 4302 Opc = AArch64::INSvi16gpr; 4303 SubregIdx = AArch64::ssub; 4304 } else if (EltSize == 32) { 4305 Opc = AArch64::INSvi32gpr; 4306 SubregIdx = AArch64::ssub; 4307 } else if (EltSize == 64) { 4308 Opc = AArch64::INSvi64gpr; 4309 SubregIdx = AArch64::dsub; 4310 } else { 4311 llvm_unreachable("invalid elt size!"); 4312 } 4313 } else { 4314 if (EltSize == 8) { 4315 Opc = AArch64::INSvi8lane; 4316 SubregIdx = AArch64::bsub; 4317 } else if (EltSize == 16) { 4318 Opc = AArch64::INSvi16lane; 4319 SubregIdx = AArch64::hsub; 4320 } else if (EltSize == 32) { 4321 Opc = AArch64::INSvi32lane; 4322 SubregIdx = AArch64::ssub; 4323 } else if (EltSize == 64) { 4324 Opc = AArch64::INSvi64lane; 4325 SubregIdx = AArch64::dsub; 4326 } else { 4327 llvm_unreachable("invalid elt size!"); 4328 } 4329 } 4330 return std::make_pair(Opc, SubregIdx); 4331 } 4332 4333 MachineInstr *AArch64InstructionSelector::emitInstr( 4334 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4335 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4336 const ComplexRendererFns &RenderFns) const { 4337 assert(Opcode && "Expected an opcode?"); 4338 assert(!isPreISelGenericOpcode(Opcode) && 4339 "Function should only be used to produce selected instructions!"); 4340 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4341 if (RenderFns) 4342 for (auto &Fn : *RenderFns) 4343 Fn(MI); 4344 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4345 return &*MI; 4346 } 4347 4348 MachineInstr *AArch64InstructionSelector::emitAddSub( 4349 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4350 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4351 MachineIRBuilder &MIRBuilder) const { 4352 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4353 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4354 auto Ty = MRI.getType(LHS.getReg()); 4355 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4356 unsigned Size = Ty.getSizeInBits(); 4357 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4358 bool Is32Bit = Size == 32; 4359 4360 // INSTRri form with positive arithmetic immediate. 4361 if (auto Fns = selectArithImmed(RHS)) 4362 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4363 MIRBuilder, Fns); 4364 4365 // INSTRri form with negative arithmetic immediate. 4366 if (auto Fns = selectNegArithImmed(RHS)) 4367 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4368 MIRBuilder, Fns); 4369 4370 // INSTRrx form. 4371 if (auto Fns = selectArithExtendedRegister(RHS)) 4372 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4373 MIRBuilder, Fns); 4374 4375 // INSTRrs form. 
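// e.g. add x0, x1, x2, lsl #3 when the RHS is a shifted register.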
4376 if (auto Fns = selectShiftedRegister(RHS)) 4377 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4378 MIRBuilder, Fns); 4379 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4380 MIRBuilder); 4381 } 4382 4383 MachineInstr * 4384 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4385 MachineOperand &RHS, 4386 MachineIRBuilder &MIRBuilder) const { 4387 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4388 {{AArch64::ADDXri, AArch64::ADDWri}, 4389 {AArch64::ADDXrs, AArch64::ADDWrs}, 4390 {AArch64::ADDXrr, AArch64::ADDWrr}, 4391 {AArch64::SUBXri, AArch64::SUBWri}, 4392 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4393 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4394 } 4395 4396 MachineInstr * 4397 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4398 MachineOperand &RHS, 4399 MachineIRBuilder &MIRBuilder) const { 4400 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4401 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4402 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4403 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4404 {AArch64::SUBSXri, AArch64::SUBSWri}, 4405 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4406 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4407 } 4408 4409 MachineInstr * 4410 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4411 MachineOperand &RHS, 4412 MachineIRBuilder &MIRBuilder) const { 4413 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4414 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4415 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4416 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4417 {AArch64::ADDSXri, AArch64::ADDSWri}, 4418 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4419 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4420 } 4421 4422 MachineInstr * 4423 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4424 MachineIRBuilder &MIRBuilder) const { 4425 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4426 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4427 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4428 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4429 } 4430 4431 MachineInstr * 4432 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4433 MachineIRBuilder &MIRBuilder) const { 4434 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4435 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4436 LLT Ty = MRI.getType(LHS.getReg()); 4437 unsigned RegSize = Ty.getSizeInBits(); 4438 bool Is32Bit = (RegSize == 32); 4439 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4440 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4441 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4442 // ANDS needs a logical immediate for its immediate form. Check if we can 4443 // fold one in. 
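// e.g. a test against 0xff can be emitted as ANDSWri with the encoded logical
// immediate instead of materializing the mask in a register first.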
4444 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4445 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4446 4447 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4448 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4449 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4450 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4451 return &*TstMI; 4452 } 4453 } 4454 4455 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4456 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4457 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4458 } 4459 4460 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4461 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4462 MachineIRBuilder &MIRBuilder) const { 4463 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4464 assert(Predicate.isPredicate() && "Expected predicate?"); 4465 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4466 LLT CmpTy = MRI.getType(LHS.getReg()); 4467 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4468 unsigned Size = CmpTy.getSizeInBits(); 4469 (void)Size; 4470 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4471 // Fold the compare into a cmn or tst if possible. 4472 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4473 return FoldCmp; 4474 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4475 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4476 } 4477 4478 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4479 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4480 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4481 #ifndef NDEBUG 4482 LLT Ty = MRI.getType(Dst); 4483 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4484 "Expected a 32-bit scalar register?"); 4485 #endif 4486 const Register ZReg = AArch64::WZR; 4487 AArch64CC::CondCode CC1, CC2; 4488 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4489 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4490 if (CC2 == AArch64CC::AL) 4491 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4492 MIRBuilder); 4493 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4494 Register Def1Reg = MRI.createVirtualRegister(RC); 4495 Register Def2Reg = MRI.createVirtualRegister(RC); 4496 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4497 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4498 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4499 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4500 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4501 return &*OrMI; 4502 } 4503 4504 MachineInstr * 4505 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, 4506 MachineIRBuilder &MIRBuilder, 4507 Optional<CmpInst::Predicate> Pred) const { 4508 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4509 LLT Ty = MRI.getType(LHS); 4510 if (Ty.isVector()) 4511 return nullptr; 4512 unsigned OpSize = Ty.getSizeInBits(); 4513 if (OpSize != 32 && OpSize != 64) 4514 return nullptr; 4515 4516 // If this is a compare against +0.0, then we don't have 4517 // to explicitly materialize a constant. 
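// The FCMPSri/FCMPDri forms compare a register directly against #0.0.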
4518 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4519 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4520
4521 auto IsEqualityPred = [](CmpInst::Predicate P) {
4522 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4523 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4524 };
4525 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4526 // Try commuting the operands.
4527 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4528 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4529 ShouldUseImm = true;
4530 std::swap(LHS, RHS);
4531 }
4532 }
4533 unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4534 {AArch64::FCMPSri, AArch64::FCMPDri}};
4535 unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4536
4537 // Partially build the compare. Decide if we need to add a use for the RHS
4538 // based on whether we're comparing against 0.0.
4539 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4540 if (!ShouldUseImm)
4541 CmpMI.addUse(RHS);
4542 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4543 return &*CmpMI;
4544 }
4545
4546 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4547 Optional<Register> Dst, Register Op1, Register Op2,
4548 MachineIRBuilder &MIRBuilder) const {
4549 // We implement a vector concat by:
4550 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4551 // 2. Insert the upper vector into the destination's upper element
4552 // TODO: some of this code is common with G_BUILD_VECTOR handling.
4553 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4554
4555 const LLT Op1Ty = MRI.getType(Op1);
4556 const LLT Op2Ty = MRI.getType(Op2);
4557
4558 if (Op1Ty != Op2Ty) {
4559 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4560 return nullptr;
4561 }
4562 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4563
4564 if (Op1Ty.getSizeInBits() >= 128) {
4565 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4566 return nullptr;
4567 }
4568
4569 // At the moment we just support 64 bit vector concats.
4570 if (Op1Ty.getSizeInBits() != 64) {
4571 LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4572 return nullptr;
4573 }
4574
4575 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4576 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4577 const TargetRegisterClass *DstRC =
4578 getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4579
4580 MachineInstr *WidenedOp1 =
4581 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4582 MachineInstr *WidenedOp2 =
4583 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4584 if (!WidenedOp1 || !WidenedOp2) {
4585 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4586 return nullptr;
4587 }
4588
4589 // Now do the insert of the upper element.
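// For two 64-bit operands this is an INSvi64lane that moves lane 0 of the
// widened second operand into lane 1 of the result.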
4590 unsigned InsertOpc, InsSubRegIdx; 4591 std::tie(InsertOpc, InsSubRegIdx) = 4592 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); 4593 4594 if (!Dst) 4595 Dst = MRI.createVirtualRegister(DstRC); 4596 auto InsElt = 4597 MIRBuilder 4598 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) 4599 .addImm(1) /* Lane index */ 4600 .addUse(WidenedOp2->getOperand(0).getReg()) 4601 .addImm(0); 4602 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 4603 return &*InsElt; 4604 } 4605 4606 MachineInstr * 4607 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, 4608 Register Src2, AArch64CC::CondCode Pred, 4609 MachineIRBuilder &MIRBuilder) const { 4610 auto &MRI = *MIRBuilder.getMRI(); 4611 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst); 4612 // If we used a register class, then this won't necessarily have an LLT. 4613 // Compute the size based off whether or not we have a class or bank. 4614 unsigned Size; 4615 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 4616 Size = TRI.getRegSizeInBits(*RC); 4617 else 4618 Size = MRI.getType(Dst).getSizeInBits(); 4619 // Some opcodes use s1. 4620 assert(Size <= 64 && "Expected 64 bits or less only!"); 4621 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr}; 4622 unsigned Opc = OpcTable[Size == 64]; 4623 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred); 4624 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI); 4625 return &*CSINC; 4626 } 4627 4628 std::pair<MachineInstr *, AArch64CC::CondCode> 4629 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, 4630 MachineOperand &LHS, 4631 MachineOperand &RHS, 4632 MachineIRBuilder &MIRBuilder) const { 4633 switch (Opcode) { 4634 default: 4635 llvm_unreachable("Unexpected opcode!"); 4636 case TargetOpcode::G_SADDO: 4637 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4638 case TargetOpcode::G_UADDO: 4639 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4640 case TargetOpcode::G_SSUBO: 4641 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4642 case TargetOpcode::G_USUBO: 4643 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4644 } 4645 } 4646 4647 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { 4648 MachineRegisterInfo &MRI = *MIB.getMRI(); 4649 // We want to recognize this pattern: 4650 // 4651 // $z = G_FCMP pred, $x, $y 4652 // ... 4653 // $w = G_SELECT $z, $a, $b 4654 // 4655 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 4656 // some copies/truncs in between.) 4657 // 4658 // If we see this, then we can emit something like this: 4659 // 4660 // fcmp $x, $y 4661 // fcsel $w, $a, $b, pred 4662 // 4663 // Rather than emitting both of the rather long sequences in the standard 4664 // G_FCMP/G_SELECT select methods. 4665 4666 // First, check if the condition is defined by a compare. 4667 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 4668 while (CondDef) { 4669 // We can only fold if all of the defs have one use. 4670 Register CondDefReg = CondDef->getOperand(0).getReg(); 4671 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 4672 // Unless it's another select. 
4673 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 4674 if (CondDef == &UI) 4675 continue; 4676 if (UI.getOpcode() != TargetOpcode::G_SELECT) 4677 return false; 4678 } 4679 } 4680 4681 // We can skip over G_TRUNC since the condition is 1-bit. 4682 // Truncating/extending can have no impact on the value. 4683 unsigned Opc = CondDef->getOpcode(); 4684 if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) 4685 break; 4686 4687 // Can't see past copies from physregs. 4688 if (Opc == TargetOpcode::COPY && 4689 Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) 4690 return false; 4691 4692 CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); 4693 } 4694 4695 // Is the condition defined by a compare? 4696 if (!CondDef) 4697 return false; 4698 4699 unsigned CondOpc = CondDef->getOpcode(); 4700 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) 4701 return false; 4702 4703 AArch64CC::CondCode CondCode; 4704 if (CondOpc == TargetOpcode::G_ICMP) { 4705 auto Pred = 4706 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 4707 CondCode = changeICMPPredToAArch64CC(Pred); 4708 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 4709 CondDef->getOperand(1), MIB); 4710 } else { 4711 // Get the condition code for the select. 4712 auto Pred = 4713 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 4714 AArch64CC::CondCode CondCode2; 4715 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 4716 4717 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 4718 // instructions to emit the comparison. 4719 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 4720 // unnecessary. 4721 if (CondCode2 != AArch64CC::AL) 4722 return false; 4723 4724 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 4725 CondDef->getOperand(3).getReg(), MIB)) { 4726 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 4727 return false; 4728 } 4729 } 4730 4731 // Emit the select. 4732 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 4733 I.getOperand(3).getReg(), CondCode, MIB); 4734 I.eraseFromParent(); 4735 return true; 4736 } 4737 4738 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 4739 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4740 MachineIRBuilder &MIRBuilder) const { 4741 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 4742 "Unexpected MachineOperand"); 4743 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4744 // We want to find this sort of thing: 4745 // x = G_SUB 0, y 4746 // G_ICMP z, x 4747 // 4748 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 
4749 // e.g: 4750 // 4751 // cmn z, y 4752 4753 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 4754 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 4755 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 4756 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 4757 // Given this: 4758 // 4759 // x = G_SUB 0, y 4760 // G_ICMP x, z 4761 // 4762 // Produce this: 4763 // 4764 // cmn y, z 4765 if (isCMN(LHSDef, P, MRI)) 4766 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 4767 4768 // Same idea here, but with the RHS of the compare instead: 4769 // 4770 // Given this: 4771 // 4772 // x = G_SUB 0, y 4773 // G_ICMP z, x 4774 // 4775 // Produce this: 4776 // 4777 // cmn z, y 4778 if (isCMN(RHSDef, P, MRI)) 4779 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 4780 4781 // Given this: 4782 // 4783 // z = G_AND x, y 4784 // G_ICMP z, 0 4785 // 4786 // Produce this if the compare is signed: 4787 // 4788 // tst x, y 4789 if (!CmpInst::isUnsigned(P) && LHSDef && 4790 LHSDef->getOpcode() == TargetOpcode::G_AND) { 4791 // Make sure that the RHS is 0. 4792 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 4793 if (!ValAndVReg || ValAndVReg->Value != 0) 4794 return nullptr; 4795 4796 return emitTST(LHSDef->getOperand(1), 4797 LHSDef->getOperand(2), MIRBuilder); 4798 } 4799 4800 return nullptr; 4801 } 4802 4803 bool AArch64InstructionSelector::selectShuffleVector( 4804 MachineInstr &I, MachineRegisterInfo &MRI) { 4805 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 4806 Register Src1Reg = I.getOperand(1).getReg(); 4807 const LLT Src1Ty = MRI.getType(Src1Reg); 4808 Register Src2Reg = I.getOperand(2).getReg(); 4809 const LLT Src2Ty = MRI.getType(Src2Reg); 4810 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 4811 4812 MachineBasicBlock &MBB = *I.getParent(); 4813 MachineFunction &MF = *MBB.getParent(); 4814 LLVMContext &Ctx = MF.getFunction().getContext(); 4815 4816 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 4817 // it's originated from a <1 x T> type. Those should have been lowered into 4818 // G_BUILD_VECTOR earlier. 4819 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 4820 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 4821 return false; 4822 } 4823 4824 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 4825 4826 SmallVector<Constant *, 64> CstIdxs; 4827 for (int Val : Mask) { 4828 // For now, any undef indexes we'll just assume to be 0. This should be 4829 // optimized in future, e.g. to select DUP etc. 4830 Val = Val < 0 ? 0 : Val; 4831 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 4832 unsigned Offset = Byte + Val * BytesPerElt; 4833 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 4834 } 4835 } 4836 4837 // Use a constant pool to load the index vector for TBL. 4838 Constant *CPVal = ConstantVector::get(CstIdxs); 4839 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 4840 if (!IndexLoad) { 4841 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 4842 return false; 4843 } 4844 4845 if (DstTy.getSizeInBits() != 128) { 4846 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 4847 // This case can be done with TBL1. 
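// Concatenate the two 64-bit sources into a single 128-bit register, widen
// the 64-bit index vector to FPR128, and let one TBLv16i8One perform the
// byte-level shuffle; the low 64 bits of the result are then copied out via
// the dsub subregister.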
4848 MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB); 4849 if (!Concat) { 4850 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 4851 return false; 4852 } 4853 4854 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 4855 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 4856 IndexLoad->getOperand(0).getReg(), MIB); 4857 4858 auto TBL1 = MIB.buildInstr( 4859 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 4860 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 4861 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 4862 4863 auto Copy = 4864 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 4865 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 4866 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 4867 I.eraseFromParent(); 4868 return true; 4869 } 4870 4871 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 4872 // Q registers for regalloc. 4873 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 4874 auto RegSeq = createQTuple(Regs, MIB); 4875 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 4876 {RegSeq, IndexLoad->getOperand(0)}); 4877 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 4878 I.eraseFromParent(); 4879 return true; 4880 } 4881 4882 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 4883 Optional<Register> DstReg, Register SrcReg, Register EltReg, 4884 unsigned LaneIdx, const RegisterBank &RB, 4885 MachineIRBuilder &MIRBuilder) const { 4886 MachineInstr *InsElt = nullptr; 4887 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 4888 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4889 4890 // Create a register to define with the insert if one wasn't passed in. 
4891 if (!DstReg) 4892 DstReg = MRI.createVirtualRegister(DstRC); 4893 4894 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 4895 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 4896 4897 if (RB.getID() == AArch64::FPRRegBankID) { 4898 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 4899 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 4900 .addImm(LaneIdx) 4901 .addUse(InsSub->getOperand(0).getReg()) 4902 .addImm(0); 4903 } else { 4904 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 4905 .addImm(LaneIdx) 4906 .addUse(EltReg); 4907 } 4908 4909 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 4910 return InsElt; 4911 } 4912 4913 bool AArch64InstructionSelector::selectUSMovFromExtend( 4914 MachineInstr &MI, MachineRegisterInfo &MRI) { 4915 if (MI.getOpcode() != TargetOpcode::G_SEXT && 4916 MI.getOpcode() != TargetOpcode::G_ZEXT && 4917 MI.getOpcode() != TargetOpcode::G_ANYEXT) 4918 return false; 4919 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 4920 const Register DefReg = MI.getOperand(0).getReg(); 4921 const LLT DstTy = MRI.getType(DefReg); 4922 unsigned DstSize = DstTy.getSizeInBits(); 4923 4924 if (DstSize != 32 && DstSize != 64) 4925 return false; 4926 4927 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 4928 MI.getOperand(1).getReg(), MRI); 4929 int64_t Lane; 4930 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 4931 return false; 4932 Register Src0 = Extract->getOperand(1).getReg(); 4933 4934 const LLT &VecTy = MRI.getType(Src0); 4935 4936 if (VecTy.getSizeInBits() != 128) { 4937 const MachineInstr *ScalarToVector = emitScalarToVector( 4938 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 4939 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 4940 Src0 = ScalarToVector->getOperand(0).getReg(); 4941 } 4942 4943 unsigned Opcode; 4944 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 4945 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 4946 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 4947 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 4948 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 4949 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 4950 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 4951 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 4952 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 4953 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 4954 else 4955 llvm_unreachable("Unexpected type combo for S/UMov!"); 4956 4957 // We may need to generate one of these, depending on the type and sign of the 4958 // input: 4959 // DstReg = SMOV Src0, Lane; 4960 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 4961 MachineInstr *ExtI = nullptr; 4962 if (DstSize == 64 && !IsSigned) { 4963 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 4964 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 4965 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 4966 .addImm(0) 4967 .addUse(NewReg) 4968 .addImm(AArch64::sub_32); 4969 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 4970 } else 4971 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 4972 4973 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 4974 MI.eraseFromParent(); 4975 return true; 4976 } 4977 4978 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, 4979 MachineRegisterInfo &MRI) { 4980 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); 4981 4982 // Get information on the destination. 4983 Register DstReg = I.getOperand(0).getReg(); 4984 const LLT DstTy = MRI.getType(DstReg); 4985 unsigned VecSize = DstTy.getSizeInBits(); 4986 4987 // Get information on the element we want to insert into the destination. 4988 Register EltReg = I.getOperand(2).getReg(); 4989 const LLT EltTy = MRI.getType(EltReg); 4990 unsigned EltSize = EltTy.getSizeInBits(); 4991 if (EltSize < 16 || EltSize > 64) 4992 return false; // Don't support all element types yet. 4993 4994 // Find the definition of the index. Bail out if it's not defined by a 4995 // G_CONSTANT. 4996 Register IdxReg = I.getOperand(3).getReg(); 4997 auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI); 4998 if (!VRegAndVal) 4999 return false; 5000 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 5001 5002 // Perform the lane insert. 5003 Register SrcReg = I.getOperand(1).getReg(); 5004 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5005 5006 if (VecSize < 128) { 5007 // If the vector we're inserting into is smaller than 128 bits, widen it 5008 // to 128 to do the insert. 5009 MachineInstr *ScalarToVec = 5010 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB); 5011 if (!ScalarToVec) 5012 return false; 5013 SrcReg = ScalarToVec->getOperand(0).getReg(); 5014 } 5015 5016 // Create an insert into a new FPR128 register. 5017 // Note that if our vector is already 128 bits, we end up emitting an extra 5018 // register. 5019 MachineInstr *InsMI = 5020 emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB); 5021 5022 if (VecSize < 128) { 5023 // If we had to widen to perform the insert, then we have to demote back to 5024 // the original size to get the result we want. 5025 Register DemoteVec = InsMI->getOperand(0).getReg(); 5026 const TargetRegisterClass *RC = 5027 getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); 5028 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5029 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5030 return false; 5031 } 5032 unsigned SubReg = 0; 5033 if (!getSubRegForClass(RC, TRI, SubReg)) 5034 return false; 5035 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5036 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << VecSize 5037 << "\n"); 5038 return false; 5039 } 5040 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 5041 .addReg(DemoteVec, 0, SubReg); 5042 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5043 } else { 5044 // No widening needed. 5045 InsMI->getOperand(0).setReg(DstReg); 5046 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 5047 } 5048 5049 I.eraseFromParent(); 5050 return true; 5051 } 5052 5053 MachineInstr * 5054 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5055 MachineIRBuilder &MIRBuilder, 5056 MachineRegisterInfo &MRI) { 5057 LLT DstTy = MRI.getType(Dst); 5058 unsigned DstSize = DstTy.getSizeInBits(); 5059 if (CV->isNullValue()) { 5060 if (DstSize == 128) { 5061 auto Mov = 5062 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5063 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5064 return &*Mov; 5065 } 5066 5067 if (DstSize == 64) { 5068 auto Mov = 5069 MIRBuilder 5070 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5071 .addImm(0); 5072 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5073 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5074 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5075 return &*Copy; 5076 } 5077 } 5078 5079 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5080 if (!CPLoad) { 5081 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5082 return nullptr; 5083 } 5084 5085 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5086 RBI.constrainGenericRegister( 5087 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5088 return &*Copy; 5089 } 5090 5091 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5092 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5093 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5094 unsigned DstSize = DstTy.getSizeInBits(); 5095 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5096 if (DstSize < 32) 5097 return false; 5098 // Check if we're building a constant vector, in which case we want to 5099 // generate a constant pool load instead of a vector insert sequence. 5100 SmallVector<Constant *, 16> Csts; 5101 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5102 // Try to find G_CONSTANT or G_FCONSTANT 5103 auto *OpMI = 5104 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5105 if (OpMI) 5106 Csts.emplace_back( 5107 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5108 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5109 I.getOperand(Idx).getReg(), MRI))) 5110 Csts.emplace_back( 5111 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5112 else 5113 return false; 5114 } 5115 Constant *CV = ConstantVector::get(Csts); 5116 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5117 return false; 5118 I.eraseFromParent(); 5119 return true; 5120 } 5121 5122 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5123 MachineInstr &I, MachineRegisterInfo &MRI) { 5124 // Given: 5125 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5126 // 5127 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5128 Register Dst = I.getOperand(0).getReg(); 5129 Register EltReg = I.getOperand(1).getReg(); 5130 LLT EltTy = MRI.getType(EltReg); 5131 // If the index isn't on the same bank as its elements, then this can't be a 5132 // SUBREG_TO_REG. 
5133 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5134 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5135 if (EltRB != DstRB) 5136 return false; 5137 if (any_of(make_range(I.operands_begin() + 2, I.operands_end()), 5138 [&MRI](const MachineOperand &Op) { 5139 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), 5140 MRI); 5141 })) 5142 return false; 5143 unsigned SubReg; 5144 const TargetRegisterClass *EltRC = 5145 getMinClassForRegBank(EltRB, EltTy.getSizeInBits()); 5146 if (!EltRC) 5147 return false; 5148 const TargetRegisterClass *DstRC = 5149 getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits()); 5150 if (!DstRC) 5151 return false; 5152 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5153 return false; 5154 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5155 .addImm(0) 5156 .addUse(EltReg) 5157 .addImm(SubReg); 5158 I.eraseFromParent(); 5159 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5160 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5161 } 5162 5163 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5164 MachineRegisterInfo &MRI) { 5165 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5166 // Until we port more of the optimized selections, for now just use a vector 5167 // insert sequence. 5168 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5169 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5170 unsigned EltSize = EltTy.getSizeInBits(); 5171 5172 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5173 return true; 5174 if (tryOptBuildVecToSubregToReg(I, MRI)) 5175 return true; 5176 5177 if (EltSize < 16 || EltSize > 64) 5178 return false; // Don't support all element types yet. 5179 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5180 5181 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5182 MachineInstr *ScalarToVec = 5183 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5184 I.getOperand(1).getReg(), MIB); 5185 if (!ScalarToVec) 5186 return false; 5187 5188 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5189 unsigned DstSize = DstTy.getSizeInBits(); 5190 5191 // Keep track of the last MI we inserted. Later on, we might be able to save 5192 // a copy using it. 5193 MachineInstr *PrevMI = nullptr; 5194 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5195 // Note that if we don't do a subregister copy, we can end up making an 5196 // extra register. 5197 PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, 5198 MIB); 5199 DstVec = PrevMI->getOperand(0).getReg(); 5200 } 5201 5202 // If DstTy's size in bits is less than 128, then emit a subregister copy 5203 // from DstVec to the last register we've defined. 5204 if (DstSize < 128) { 5205 // Force this to be FPR using the destination vector. 5206 const TargetRegisterClass *RC = 5207 getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); 5208 if (!RC) 5209 return false; 5210 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5211 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5212 return false; 5213 } 5214 5215 unsigned SubReg = 0; 5216 if (!getSubRegForClass(RC, TRI, SubReg)) 5217 return false; 5218 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5219 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << DstSize 5220 << "\n"); 5221 return false; 5222 } 5223 5224 Register Reg = MRI.createVirtualRegister(RC); 5225 Register DstReg = I.getOperand(0).getReg(); 5226 5227 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5228 MachineOperand &RegOp = I.getOperand(1); 5229 RegOp.setReg(Reg); 5230 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5231 } else { 5232 // We don't need a subregister copy. Save a copy by re-using the 5233 // destination register on the final insert. 5234 assert(PrevMI && "PrevMI was null?"); 5235 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5236 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5237 } 5238 5239 I.eraseFromParent(); 5240 return true; 5241 } 5242 5243 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 5244 unsigned NumVecs, 5245 MachineInstr &I) { 5246 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5247 assert(Opc && "Expected an opcode?"); 5248 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5249 auto &MRI = *MIB.getMRI(); 5250 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5251 unsigned Size = Ty.getSizeInBits(); 5252 assert((Size == 64 || Size == 128) && 5253 "Destination must be 64 bits or 128 bits?"); 5254 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 5255 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 5256 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 5257 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 5258 Load.cloneMemRefs(I); 5259 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5260 Register SelectedLoadDst = Load->getOperand(0).getReg(); 5261 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 5262 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 5263 .addReg(SelectedLoadDst, 0, SubReg + Idx); 5264 // Emit the subreg copies and immediately select them. 5265 // FIXME: We should refactor our copy code into an emitCopy helper and 5266 // clean up uses of this pattern elsewhere in the selector. 5267 selectCopy(*Vec, TII, MRI, TRI, RBI); 5268 } 5269 return true; 5270 } 5271 5272 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 5273 MachineInstr &I, MachineRegisterInfo &MRI) { 5274 // Find the intrinsic ID. 5275 unsigned IntrinID = I.getIntrinsicID(); 5276 5277 const LLT S8 = LLT::scalar(8); 5278 const LLT S16 = LLT::scalar(16); 5279 const LLT S32 = LLT::scalar(32); 5280 const LLT S64 = LLT::scalar(64); 5281 const LLT P0 = LLT::pointer(0, 64); 5282 // Select the instruction. 5283 switch (IntrinID) { 5284 default: 5285 return false; 5286 case Intrinsic::aarch64_ldxp: 5287 case Intrinsic::aarch64_ldaxp: { 5288 auto NewI = MIB.buildInstr( 5289 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 5290 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 5291 {I.getOperand(3)}); 5292 NewI.cloneMemRefs(I); 5293 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 5294 break; 5295 } 5296 case Intrinsic::trap: 5297 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); 5298 break; 5299 case Intrinsic::debugtrap: 5300 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); 5301 break; 5302 case Intrinsic::ubsantrap: 5303 MIB.buildInstr(AArch64::BRK, {}, {}) 5304 .addImm(I.getOperand(1).getImm() | ('U' << 8)); 5305 break; 5306 case Intrinsic::aarch64_neon_ld2: { 5307 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5308 unsigned Opc = 0; 5309 if (Ty == LLT::fixed_vector(8, S8)) 5310 Opc = AArch64::LD2Twov8b; 5311 else if (Ty == LLT::fixed_vector(16, S8)) 5312 Opc = AArch64::LD2Twov16b; 5313 else if (Ty == LLT::fixed_vector(4, S16)) 5314 Opc = AArch64::LD2Twov4h; 5315 else if (Ty == LLT::fixed_vector(8, S16)) 5316 Opc = AArch64::LD2Twov8h; 5317 else if (Ty == LLT::fixed_vector(2, S32)) 5318 Opc = AArch64::LD2Twov2s; 5319 else if (Ty == LLT::fixed_vector(4, S32)) 5320 Opc = AArch64::LD2Twov4s; 5321 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5322 Opc = AArch64::LD2Twov2d; 5323 else if (Ty == S64 || Ty == P0) 5324 Opc = AArch64::LD1Twov1d; 5325 else 5326 llvm_unreachable("Unexpected type for ld2!"); 5327 selectVectorLoadIntrinsic(Opc, 2, I); 5328 break; 5329 } 5330 case Intrinsic::aarch64_neon_ld4: { 5331 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5332 unsigned Opc = 0; 5333 if (Ty == LLT::fixed_vector(8, S8)) 5334 Opc = AArch64::LD4Fourv8b; 5335 else if (Ty == LLT::fixed_vector(16, S8)) 5336 Opc = AArch64::LD4Fourv16b; 5337 else if (Ty == LLT::fixed_vector(4, S16)) 5338 Opc = AArch64::LD4Fourv4h; 5339 else if (Ty == LLT::fixed_vector(8, S16)) 5340 Opc = AArch64::LD4Fourv8h; 5341 else if (Ty == LLT::fixed_vector(2, S32)) 5342 Opc = AArch64::LD4Fourv2s; 5343 else if (Ty == LLT::fixed_vector(4, S32)) 5344 Opc = AArch64::LD4Fourv4s; 5345 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5346 Opc = AArch64::LD4Fourv2d; 5347 else if (Ty == S64 || Ty == P0) 5348 Opc = AArch64::LD1Fourv1d; 5349 else 5350 llvm_unreachable("Unexpected type for ld4!"); 5351 selectVectorLoadIntrinsic(Opc, 4, I); 5352 break; 5353 } 5354 case Intrinsic::aarch64_neon_st2: { 5355 Register Src1 = I.getOperand(1).getReg(); 5356 Register Src2 = I.getOperand(2).getReg(); 5357 Register Ptr = I.getOperand(3).getReg(); 5358 LLT Ty = MRI.getType(Src1); 5359 unsigned Opc; 5360 if (Ty == LLT::fixed_vector(8, S8)) 5361 Opc = AArch64::ST2Twov8b; 5362 else if (Ty == LLT::fixed_vector(16, S8)) 5363 Opc = AArch64::ST2Twov16b; 5364 else if (Ty == LLT::fixed_vector(4, S16)) 5365 Opc = AArch64::ST2Twov4h; 5366 else if (Ty == LLT::fixed_vector(8, S16)) 5367 Opc = AArch64::ST2Twov8h; 5368 else if (Ty == LLT::fixed_vector(2, S32)) 5369 Opc = AArch64::ST2Twov2s; 5370 else if (Ty == LLT::fixed_vector(4, S32)) 5371 Opc = AArch64::ST2Twov4s; 5372 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5373 Opc = AArch64::ST2Twov2d; 5374 else if (Ty == S64 || Ty == P0) 5375 Opc = AArch64::ST1Twov1d; 5376 else 5377 llvm_unreachable("Unexpected type for st2!"); 5378 SmallVector<Register, 2> Regs = {Src1, Src2}; 5379 Register Tuple = Ty.getSizeInBits() == 128 ? 
createQTuple(Regs, MIB) 5380 : createDTuple(Regs, MIB); 5381 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); 5382 Store.cloneMemRefs(I); 5383 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 5384 break; 5385 } 5386 } 5387 5388 I.eraseFromParent(); 5389 return true; 5390 } 5391 5392 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, 5393 MachineRegisterInfo &MRI) { 5394 unsigned IntrinID = I.getIntrinsicID(); 5395 5396 switch (IntrinID) { 5397 default: 5398 break; 5399 case Intrinsic::aarch64_crypto_sha1h: { 5400 Register DstReg = I.getOperand(0).getReg(); 5401 Register SrcReg = I.getOperand(2).getReg(); 5402 5403 // FIXME: Should this be an assert? 5404 if (MRI.getType(DstReg).getSizeInBits() != 32 || 5405 MRI.getType(SrcReg).getSizeInBits() != 32) 5406 return false; 5407 5408 // The operation has to happen on FPRs. Set up some new FPR registers for 5409 // the source and destination if they are on GPRs. 5410 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 5411 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 5412 MIB.buildCopy({SrcReg}, {I.getOperand(2)}); 5413 5414 // Make sure the copy ends up getting constrained properly. 5415 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 5416 AArch64::GPR32RegClass, MRI); 5417 } 5418 5419 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) 5420 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 5421 5422 // Actually insert the instruction. 5423 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); 5424 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); 5425 5426 // Did we create a new register for the destination? 5427 if (DstReg != I.getOperand(0).getReg()) { 5428 // Yep. Copy the result of the instruction back into the original 5429 // destination. 
5430 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 5431 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 5432 AArch64::GPR32RegClass, MRI); 5433 } 5434 5435 I.eraseFromParent(); 5436 return true; 5437 } 5438 case Intrinsic::ptrauth_sign: { 5439 Register DstReg = I.getOperand(0).getReg(); 5440 Register ValReg = I.getOperand(2).getReg(); 5441 uint64_t Key = I.getOperand(3).getImm(); 5442 Register DiscReg = I.getOperand(4).getReg(); 5443 auto DiscVal = getIConstantVRegVal(DiscReg, MRI); 5444 bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue(); 5445 5446 if (Key > 3) 5447 return false; 5448 5449 unsigned Opcodes[][4] = { 5450 {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB}, 5451 {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}}; 5452 unsigned Opcode = Opcodes[IsDiscZero][Key]; 5453 5454 auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg}); 5455 5456 if (!IsDiscZero) { 5457 PAC.addUse(DiscReg); 5458 RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI); 5459 } 5460 5461 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5462 I.eraseFromParent(); 5463 return true; 5464 } 5465 case Intrinsic::frameaddress: 5466 case Intrinsic::returnaddress: { 5467 MachineFunction &MF = *I.getParent()->getParent(); 5468 MachineFrameInfo &MFI = MF.getFrameInfo(); 5469 5470 unsigned Depth = I.getOperand(2).getImm(); 5471 Register DstReg = I.getOperand(0).getReg(); 5472 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5473 5474 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 5475 if (!MFReturnAddr) { 5476 // Insert the copy from LR/X30 into the entry block, before it can be 5477 // clobbered by anything. 5478 MFI.setReturnAddressIsTaken(true); 5479 MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR, 5480 AArch64::GPR64RegClass); 5481 } 5482 5483 if (STI.hasPAuth()) { 5484 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 5485 } else { 5486 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 5487 MIB.buildInstr(AArch64::XPACLRI); 5488 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5489 } 5490 5491 I.eraseFromParent(); 5492 return true; 5493 } 5494 5495 MFI.setFrameAddressIsTaken(true); 5496 Register FrameAddr(AArch64::FP); 5497 while (Depth--) { 5498 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 5499 auto Ldr = 5500 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 5501 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 5502 FrameAddr = NextFrame; 5503 } 5504 5505 if (IntrinID == Intrinsic::frameaddress) 5506 MIB.buildCopy({DstReg}, {FrameAddr}); 5507 else { 5508 MFI.setReturnAddressIsTaken(true); 5509 5510 if (STI.hasPAuth()) { 5511 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 5512 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 5513 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 5514 } else { 5515 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 5516 .addImm(1); 5517 MIB.buildInstr(AArch64::XPACLRI); 5518 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5519 } 5520 } 5521 5522 I.eraseFromParent(); 5523 return true; 5524 } 5525 case Intrinsic::swift_async_context_addr: 5526 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 5527 {Register(AArch64::FP)}) 5528 .addImm(8) 5529 .addImm(0); 5530 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 5531 5532 MF->getFrameInfo().setFrameAddressIsTaken(true); 5533 
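// Note that the SUBXri above materializes FP - 8, i.e. the slot immediately
// below the frame record where the Swift async context pointer is kept.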
MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 5534 I.eraseFromParent(); 5535 return true; 5536 } 5537 return false; 5538 } 5539 5540 InstructionSelector::ComplexRendererFns 5541 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 5542 auto MaybeImmed = getImmedFromMO(Root); 5543 if (MaybeImmed == None || *MaybeImmed > 31) 5544 return None; 5545 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 5546 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5547 } 5548 5549 InstructionSelector::ComplexRendererFns 5550 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 5551 auto MaybeImmed = getImmedFromMO(Root); 5552 if (MaybeImmed == None || *MaybeImmed > 31) 5553 return None; 5554 uint64_t Enc = 31 - *MaybeImmed; 5555 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5556 } 5557 5558 InstructionSelector::ComplexRendererFns 5559 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 5560 auto MaybeImmed = getImmedFromMO(Root); 5561 if (MaybeImmed == None || *MaybeImmed > 63) 5562 return None; 5563 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 5564 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5565 } 5566 5567 InstructionSelector::ComplexRendererFns 5568 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 5569 auto MaybeImmed = getImmedFromMO(Root); 5570 if (MaybeImmed == None || *MaybeImmed > 63) 5571 return None; 5572 uint64_t Enc = 63 - *MaybeImmed; 5573 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5574 } 5575 5576 /// Helper to select an immediate value that can be represented as a 12-bit 5577 /// value shifted left by either 0 or 12. If it is possible to do so, return 5578 /// the immediate and shift value. If not, return None. 5579 /// 5580 /// Used by selectArithImmed and selectNegArithImmed. 5581 InstructionSelector::ComplexRendererFns 5582 AArch64InstructionSelector::select12BitValueWithLeftShift( 5583 uint64_t Immed) const { 5584 unsigned ShiftAmt; 5585 if (Immed >> 12 == 0) { 5586 ShiftAmt = 0; 5587 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 5588 ShiftAmt = 12; 5589 Immed = Immed >> 12; 5590 } else 5591 return None; 5592 5593 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 5594 return {{ 5595 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 5596 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 5597 }}; 5598 } 5599 5600 /// SelectArithImmed - Select an immediate value that can be represented as 5601 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 5602 /// Val set to the 12-bit value and Shift set to the shifter operand. 5603 InstructionSelector::ComplexRendererFns 5604 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 5605 // This function is called from the addsub_shifted_imm ComplexPattern, 5606 // which lists [imm] as the list of opcode it's interested in, however 5607 // we still need to check whether the operand is actually an immediate 5608 // here because the ComplexPattern opcode list is only used in 5609 // root-level opcode matching. 5610 auto MaybeImmed = getImmedFromMO(Root); 5611 if (MaybeImmed == None) 5612 return None; 5613 return select12BitValueWithLeftShift(*MaybeImmed); 5614 } 5615 5616 /// SelectNegArithImmed - As above, but negates the value before trying to 5617 /// select it. 
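/// For example, an ADD of the immediate -16 can then be encoded as a SUB of
/// the immediate 16.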
5618 InstructionSelector::ComplexRendererFns 5619 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 5620 // We need a register here, because we need to know if we have a 64 or 32 5621 // bit immediate. 5622 if (!Root.isReg()) 5623 return None; 5624 auto MaybeImmed = getImmedFromMO(Root); 5625 if (MaybeImmed == None) 5626 return None; 5627 uint64_t Immed = *MaybeImmed; 5628 5629 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 5630 // have the opposite effect on the C flag, so this pattern mustn't match under 5631 // those circumstances. 5632 if (Immed == 0) 5633 return None; 5634 5635 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 5636 // the root. 5637 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5638 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 5639 Immed = ~((uint32_t)Immed) + 1; 5640 else 5641 Immed = ~Immed + 1ULL; 5642 5643 if (Immed & 0xFFFFFFFFFF000000ULL) 5644 return None; 5645 5646 Immed &= 0xFFFFFFULL; 5647 return select12BitValueWithLeftShift(Immed); 5648 } 5649 5650 /// Return true if it is worth folding MI into an extended register. That is, 5651 /// if it's safe to pull it into the addressing mode of a load or store as a 5652 /// shift. 5653 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 5654 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 5655 // Always fold if there is one use, or if we're optimizing for size. 5656 Register DefReg = MI.getOperand(0).getReg(); 5657 if (MRI.hasOneNonDBGUse(DefReg) || 5658 MI.getParent()->getParent()->getFunction().hasOptSize()) 5659 return true; 5660 5661 // It's better to avoid folding and recomputing shifts when we don't have a 5662 // fastpath. 5663 if (!STI.hasLSLFast()) 5664 return false; 5665 5666 // We have a fastpath, so folding a shift in and potentially computing it 5667 // many times may be beneficial. Check if this is only used in memory ops. 5668 // If it is, then we should fold. 5669 return all_of(MRI.use_nodbg_instructions(DefReg), 5670 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 5671 } 5672 5673 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 5674 switch (Type) { 5675 case AArch64_AM::SXTB: 5676 case AArch64_AM::SXTH: 5677 case AArch64_AM::SXTW: 5678 return true; 5679 default: 5680 return false; 5681 } 5682 } 5683 5684 InstructionSelector::ComplexRendererFns 5685 AArch64InstructionSelector::selectExtendedSHL( 5686 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 5687 unsigned SizeInBytes, bool WantsExt) const { 5688 assert(Base.isReg() && "Expected base to be a register operand"); 5689 assert(Offset.isReg() && "Expected offset to be a register operand"); 5690 5691 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5692 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 5693 if (!OffsetInst) 5694 return None; 5695 5696 unsigned OffsetOpc = OffsetInst->getOpcode(); 5697 bool LookedThroughZExt = false; 5698 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 5699 // Try to look through a ZEXT. 5700 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 5701 return None; 5702 5703 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 5704 OffsetOpc = OffsetInst->getOpcode(); 5705 LookedThroughZExt = true; 5706 5707 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 5708 return None; 5709 } 5710 // Make sure that the memory op is a valid size. 
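// A shift amount of 0 (i.e. a 1-byte access) leaves nothing to fold, so bail
// out in that case.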
5711 int64_t LegalShiftVal = Log2_32(SizeInBytes); 5712 if (LegalShiftVal == 0) 5713 return None; 5714 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 5715 return None; 5716 5717 // Now, try to find the specific G_CONSTANT. Start by assuming that the 5718 // register we will offset is the LHS, and the register containing the 5719 // constant is the RHS. 5720 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 5721 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 5722 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 5723 if (!ValAndVReg) { 5724 // We didn't get a constant on the RHS. If the opcode is a shift, then 5725 // we're done. 5726 if (OffsetOpc == TargetOpcode::G_SHL) 5727 return None; 5728 5729 // If we have a G_MUL, we can use either register. Try looking at the RHS. 5730 std::swap(OffsetReg, ConstantReg); 5731 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 5732 if (!ValAndVReg) 5733 return None; 5734 } 5735 5736 // The value must fit into 3 bits, and must be positive. Make sure that is 5737 // true. 5738 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 5739 5740 // Since we're going to pull this into a shift, the constant value must be 5741 // a power of 2. If we got a multiply, then we need to check this. 5742 if (OffsetOpc == TargetOpcode::G_MUL) { 5743 if (!isPowerOf2_32(ImmVal)) 5744 return None; 5745 5746 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 5747 ImmVal = Log2_32(ImmVal); 5748 } 5749 5750 if ((ImmVal & 0x7) != ImmVal) 5751 return None; 5752 5753 // We are only allowed to shift by LegalShiftVal. This shift value is built 5754 // into the instruction, so we can't just use whatever we want. 5755 if (ImmVal != LegalShiftVal) 5756 return None; 5757 5758 unsigned SignExtend = 0; 5759 if (WantsExt) { 5760 // Check if the offset is defined by an extend, unless we looked through a 5761 // G_ZEXT earlier. 5762 if (!LookedThroughZExt) { 5763 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 5764 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 5765 if (Ext == AArch64_AM::InvalidShiftExtend) 5766 return None; 5767 5768 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 5769 // We only support SXTW for signed extension here. 5770 if (SignExtend && Ext != AArch64_AM::SXTW) 5771 return None; 5772 OffsetReg = ExtInst->getOperand(1).getReg(); 5773 } 5774 5775 // Need a 32-bit wide register here. 5776 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 5777 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 5778 } 5779 5780 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 5781 // offset. Signify that we are shifting by setting the shift flag to 1. 5782 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 5783 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 5784 [=](MachineInstrBuilder &MIB) { 5785 // Need to add both immediates here to make sure that they are both 5786 // added to the instruction. 5787 MIB.addImm(SignExtend); 5788 MIB.addImm(1); 5789 }}}; 5790 } 5791 5792 /// This is used for computing addresses like this: 5793 /// 5794 /// ldr x1, [x2, x3, lsl #3] 5795 /// 5796 /// Where x2 is the base register, and x3 is an offset register. The shift-left 5797 /// is a constant value specific to this load instruction. That is, we'll never 5798 /// see anything other than a 3 here (which corresponds to the size of the 5799 /// element being loaded.) 
5800 InstructionSelector::ComplexRendererFns 5801 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( 5802 MachineOperand &Root, unsigned SizeInBytes) const { 5803 if (!Root.isReg()) 5804 return None; 5805 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5806 5807 // We want to find something like this: 5808 // 5809 // val = G_CONSTANT LegalShiftVal 5810 // shift = G_SHL off_reg val 5811 // ptr = G_PTR_ADD base_reg shift 5812 // x = G_LOAD ptr 5813 // 5814 // And fold it into this addressing mode: 5815 // 5816 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] 5817 5818 // Check if we can find the G_PTR_ADD. 5819 MachineInstr *PtrAdd = 5820 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 5821 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 5822 return None; 5823 5824 // Now, try to match an opcode which will match our specific offset. 5825 // We want a G_SHL or a G_MUL. 5826 MachineInstr *OffsetInst = 5827 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); 5828 return selectExtendedSHL(Root, PtrAdd->getOperand(1), 5829 OffsetInst->getOperand(0), SizeInBytes, 5830 /*WantsExt=*/false); 5831 } 5832 5833 /// This is used for computing addresses like this: 5834 /// 5835 /// ldr x1, [x2, x3] 5836 /// 5837 /// Where x2 is the base register, and x3 is an offset register. 5838 /// 5839 /// When possible (or profitable) to fold a G_PTR_ADD into the address calculation, 5840 /// this will do so. Otherwise, it will return None. 5841 InstructionSelector::ComplexRendererFns 5842 AArch64InstructionSelector::selectAddrModeRegisterOffset( 5843 MachineOperand &Root) const { 5844 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5845 5846 // We need a GEP. 5847 MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); 5848 if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) 5849 return None; 5850 5851 // If this is used more than once, let's not bother folding. 5852 // TODO: Check if they are memory ops. If they are, then we can still fold 5853 // without having to recompute anything. 5854 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) 5855 return None; 5856 5857 // Base is the GEP's LHS, offset is its RHS. 5858 return {{[=](MachineInstrBuilder &MIB) { 5859 MIB.addUse(Gep->getOperand(1).getReg()); 5860 }, 5861 [=](MachineInstrBuilder &MIB) { 5862 MIB.addUse(Gep->getOperand(2).getReg()); 5863 }, 5864 [=](MachineInstrBuilder &MIB) { 5865 // Need to add both immediates here to make sure that they are both 5866 // added to the instruction. 5867 MIB.addImm(0); 5868 MIB.addImm(0); 5869 }}}; 5870 } 5871 5872 /// This is intended to be equivalent to selectAddrModeXRO in 5873 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. 5874 InstructionSelector::ComplexRendererFns 5875 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, 5876 unsigned SizeInBytes) const { 5877 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5878 if (!Root.isReg()) 5879 return None; 5880 MachineInstr *PtrAdd = 5881 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 5882 if (!PtrAdd) 5883 return None; 5884 5885 // Check for an immediates which cannot be encoded in the [base + imm] 5886 // addressing mode, and can't be encoded in an add/sub. 
If this happens, we'll 5887 // end up with code like: 5888 // 5889 // mov x0, wide 5890 // add x1 base, x0 5891 // ldr x2, [x1, x0] 5892 // 5893 // In this situation, we can use the [base, xreg] addressing mode to save an 5894 // add/sub: 5895 // 5896 // mov x0, wide 5897 // ldr x2, [base, x0] 5898 auto ValAndVReg = 5899 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); 5900 if (ValAndVReg) { 5901 unsigned Scale = Log2_32(SizeInBytes); 5902 int64_t ImmOff = ValAndVReg->Value.getSExtValue(); 5903 5904 // Skip immediates that can be selected in the load/store addresing 5905 // mode. 5906 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && 5907 ImmOff < (0x1000 << Scale)) 5908 return None; 5909 5910 // Helper lambda to decide whether or not it is preferable to emit an add. 5911 auto isPreferredADD = [](int64_t ImmOff) { 5912 // Constants in [0x0, 0xfff] can be encoded in an add. 5913 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) 5914 return true; 5915 5916 // Can it be encoded in an add lsl #12? 5917 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) 5918 return false; 5919 5920 // It can be encoded in an add lsl #12, but we may not want to. If it is 5921 // possible to select this as a single movz, then prefer that. A single 5922 // movz is faster than an add with a shift. 5923 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && 5924 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; 5925 }; 5926 5927 // If the immediate can be encoded in a single add/sub, then bail out. 5928 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) 5929 return None; 5930 } 5931 5932 // Try to fold shifts into the addressing mode. 5933 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); 5934 if (AddrModeFns) 5935 return AddrModeFns; 5936 5937 // If that doesn't work, see if it's possible to fold in registers from 5938 // a GEP. 5939 return selectAddrModeRegisterOffset(Root); 5940 } 5941 5942 /// This is used for computing addresses like this: 5943 /// 5944 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] 5945 /// 5946 /// Where we have a 64-bit base register, a 32-bit offset register, and an 5947 /// extend (which may or may not be signed). 5948 InstructionSelector::ComplexRendererFns 5949 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, 5950 unsigned SizeInBytes) const { 5951 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5952 5953 MachineInstr *PtrAdd = 5954 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 5955 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 5956 return None; 5957 5958 MachineOperand &LHS = PtrAdd->getOperand(1); 5959 MachineOperand &RHS = PtrAdd->getOperand(2); 5960 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); 5961 5962 // The first case is the same as selectAddrModeXRO, except we need an extend. 5963 // In this case, we try to find a shift and extend, and fold them into the 5964 // addressing mode. 5965 // 5966 // E.g. 5967 // 5968 // off_reg = G_Z/S/ANYEXT ext_reg 5969 // val = G_CONSTANT LegalShiftVal 5970 // shift = G_SHL off_reg val 5971 // ptr = G_PTR_ADD base_reg shift 5972 // x = G_LOAD ptr 5973 // 5974 // In this case we can get a load like this: 5975 // 5976 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] 5977 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), 5978 SizeInBytes, /*WantsExt=*/true); 5979 if (ExtendedShl) 5980 return ExtendedShl; 5981 5982 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. 
5983 // 5984 // e.g. 5985 // ldr something, [base_reg, ext_reg, sxtw] 5986 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 5987 return None; 5988 5989 // Check if this is an extend. We'll get an extend type if it is. 5990 AArch64_AM::ShiftExtendType Ext = 5991 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); 5992 if (Ext == AArch64_AM::InvalidShiftExtend) 5993 return None; 5994 5995 // Need a 32-bit wide register. 5996 MachineIRBuilder MIB(*PtrAdd); 5997 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(), 5998 AArch64::GPR32RegClass, MIB); 5999 unsigned SignExtend = Ext == AArch64_AM::SXTW; 6000 6001 // Base is LHS, offset is ExtReg. 6002 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, 6003 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, 6004 [=](MachineInstrBuilder &MIB) { 6005 MIB.addImm(SignExtend); 6006 MIB.addImm(0); 6007 }}}; 6008 } 6009 6010 /// Select a "register plus unscaled signed 9-bit immediate" address. This 6011 /// should only match when there is an offset that is not valid for a scaled 6012 /// immediate addressing mode. The "Size" argument is the size in bytes of the 6013 /// memory reference, which is needed here to know what is valid for a scaled 6014 /// immediate. 6015 InstructionSelector::ComplexRendererFns 6016 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, 6017 unsigned Size) const { 6018 MachineRegisterInfo &MRI = 6019 Root.getParent()->getParent()->getParent()->getRegInfo(); 6020 6021 if (!Root.isReg()) 6022 return None; 6023 6024 if (!isBaseWithConstantOffset(Root, MRI)) 6025 return None; 6026 6027 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 6028 if (!RootDef) 6029 return None; 6030 6031 MachineOperand &OffImm = RootDef->getOperand(2); 6032 if (!OffImm.isReg()) 6033 return None; 6034 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); 6035 if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) 6036 return None; 6037 int64_t RHSC; 6038 MachineOperand &RHSOp1 = RHS->getOperand(1); 6039 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) 6040 return None; 6041 RHSC = RHSOp1.getCImm()->getSExtValue(); 6042 6043 // If the offset is valid as a scaled immediate, don't match here. 6044 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) 6045 return None; 6046 if (RHSC >= -256 && RHSC < 256) { 6047 MachineOperand &Base = RootDef->getOperand(1); 6048 return {{ 6049 [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, 6050 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, 6051 }}; 6052 } 6053 return None; 6054 } 6055 6056 InstructionSelector::ComplexRendererFns 6057 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, 6058 unsigned Size, 6059 MachineRegisterInfo &MRI) const { 6060 if (RootDef.getOpcode() != AArch64::G_ADD_LOW) 6061 return None; 6062 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); 6063 if (Adrp.getOpcode() != AArch64::ADRP) 6064 return None; 6065 6066 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. 
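// For the :lo12: page-offset form to be usable, the offset must be a multiple
// of the access size, the global must not be thread-local, and its alignment
// must be at least the access size.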
6067 auto Offset = Adrp.getOperand(1).getOffset(); 6068 if (Offset % Size != 0) 6069 return None; 6070 6071 auto GV = Adrp.getOperand(1).getGlobal(); 6072 if (GV->isThreadLocal()) 6073 return None; 6074 6075 auto &MF = *RootDef.getParent()->getParent(); 6076 if (GV->getPointerAlignment(MF.getDataLayout()) < Size) 6077 return None; 6078 6079 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); 6080 MachineIRBuilder MIRBuilder(RootDef); 6081 Register AdrpReg = Adrp.getOperand(0).getReg(); 6082 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, 6083 [=](MachineInstrBuilder &MIB) { 6084 MIB.addGlobalAddress(GV, Offset, 6085 OpFlags | AArch64II::MO_PAGEOFF | 6086 AArch64II::MO_NC); 6087 }}}; 6088 } 6089 6090 /// Select a "register plus scaled unsigned 12-bit immediate" address. The 6091 /// "Size" argument is the size in bytes of the memory reference, which 6092 /// determines the scale. 6093 InstructionSelector::ComplexRendererFns 6094 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, 6095 unsigned Size) const { 6096 MachineFunction &MF = *Root.getParent()->getParent()->getParent(); 6097 MachineRegisterInfo &MRI = MF.getRegInfo(); 6098 6099 if (!Root.isReg()) 6100 return None; 6101 6102 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 6103 if (!RootDef) 6104 return None; 6105 6106 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { 6107 return {{ 6108 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, 6109 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 6110 }}; 6111 } 6112 6113 CodeModel::Model CM = MF.getTarget().getCodeModel(); 6114 // Check if we can fold in the ADD of small code model ADRP + ADD address. 6115 if (CM == CodeModel::Small) { 6116 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); 6117 if (OpFns) 6118 return OpFns; 6119 } 6120 6121 if (isBaseWithConstantOffset(Root, MRI)) { 6122 MachineOperand &LHS = RootDef->getOperand(1); 6123 MachineOperand &RHS = RootDef->getOperand(2); 6124 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); 6125 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); 6126 if (LHSDef && RHSDef) { 6127 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); 6128 unsigned Scale = Log2_32(Size); 6129 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { 6130 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) 6131 return {{ 6132 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, 6133 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 6134 }}; 6135 6136 return {{ 6137 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, 6138 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 6139 }}; 6140 } 6141 } 6142 } 6143 6144 // Before falling back to our general case, check if the unscaled 6145 // instructions can handle this. If so, that's preferable. 6146 if (selectAddrModeUnscaled(Root, Size).hasValue()) 6147 return None; 6148 6149 return {{ 6150 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 6151 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 6152 }}; 6153 } 6154 6155 /// Given a shift instruction, return the correct shift type for that 6156 /// instruction. 
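/// (G_SHL maps to LSL, G_LSHR to LSR, G_ASHR to ASR and G_ROTR to ROR.)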
6157 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { 6158 switch (MI.getOpcode()) { 6159 default: 6160 return AArch64_AM::InvalidShiftExtend; 6161 case TargetOpcode::G_SHL: 6162 return AArch64_AM::LSL; 6163 case TargetOpcode::G_LSHR: 6164 return AArch64_AM::LSR; 6165 case TargetOpcode::G_ASHR: 6166 return AArch64_AM::ASR; 6167 case TargetOpcode::G_ROTR: 6168 return AArch64_AM::ROR; 6169 } 6170 } 6171 6172 /// Select a "shifted register" operand. If the value is not shifted, set the 6173 /// shift operand to a default value of "lsl 0". 6174 InstructionSelector::ComplexRendererFns 6175 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, 6176 bool AllowROR) const { 6177 if (!Root.isReg()) 6178 return None; 6179 MachineRegisterInfo &MRI = 6180 Root.getParent()->getParent()->getParent()->getRegInfo(); 6181 6182 // Check if the operand is defined by an instruction which corresponds to 6183 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. 6184 MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); 6185 if (!ShiftInst) 6186 return None; 6187 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); 6188 if (ShType == AArch64_AM::InvalidShiftExtend) 6189 return None; 6190 if (ShType == AArch64_AM::ROR && !AllowROR) 6191 return None; 6192 if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) 6193 return None; 6194 6195 // Need an immediate on the RHS. 6196 MachineOperand &ShiftRHS = ShiftInst->getOperand(2); 6197 auto Immed = getImmedFromMO(ShiftRHS); 6198 if (!Immed) 6199 return None; 6200 6201 // We have something that we can fold. Fold in the shift's LHS and RHS into 6202 // the instruction. 6203 MachineOperand &ShiftLHS = ShiftInst->getOperand(1); 6204 Register ShiftReg = ShiftLHS.getReg(); 6205 6206 unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); 6207 unsigned Val = *Immed & (NumBits - 1); 6208 unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); 6209 6210 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, 6211 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; 6212 } 6213 6214 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( 6215 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { 6216 unsigned Opc = MI.getOpcode(); 6217 6218 // Handle explicit extend instructions first. 6219 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { 6220 unsigned Size; 6221 if (Opc == TargetOpcode::G_SEXT) 6222 Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 6223 else 6224 Size = MI.getOperand(2).getImm(); 6225 assert(Size != 64 && "Extend from 64 bits?"); 6226 switch (Size) { 6227 case 8: 6228 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB; 6229 case 16: 6230 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH; 6231 case 32: 6232 return AArch64_AM::SXTW; 6233 default: 6234 return AArch64_AM::InvalidShiftExtend; 6235 } 6236 } 6237 6238 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { 6239 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 6240 assert(Size != 64 && "Extend from 64 bits?"); 6241 switch (Size) { 6242 case 8: 6243 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB; 6244 case 16: 6245 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH; 6246 case 32: 6247 return AArch64_AM::UXTW; 6248 default: 6249 return AArch64_AM::InvalidShiftExtend; 6250 } 6251 } 6252 6253 // Don't have an explicit extend. 
Try to handle a G_AND with a constant mask 6254 // on the RHS. 6255 if (Opc != TargetOpcode::G_AND) 6256 return AArch64_AM::InvalidShiftExtend; 6257 6258 Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2)); 6259 if (!MaybeAndMask) 6260 return AArch64_AM::InvalidShiftExtend; 6261 uint64_t AndMask = *MaybeAndMask; 6262 switch (AndMask) { 6263 default: 6264 return AArch64_AM::InvalidShiftExtend; 6265 case 0xFF: 6266 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; 6267 case 0xFFFF: 6268 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; 6269 case 0xFFFFFFFF: 6270 return AArch64_AM::UXTW; 6271 } 6272 } 6273 6274 Register AArch64InstructionSelector::moveScalarRegClass( 6275 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const { 6276 MachineRegisterInfo &MRI = *MIB.getMRI(); 6277 auto Ty = MRI.getType(Reg); 6278 assert(!Ty.isVector() && "Expected scalars only!"); 6279 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC)) 6280 return Reg; 6281 6282 // Create a copy and immediately select it. 6283 // FIXME: We should have an emitCopy function? 6284 auto Copy = MIB.buildCopy({&RC}, {Reg}); 6285 selectCopy(*Copy, TII, MRI, TRI, RBI); 6286 return Copy.getReg(0); 6287 } 6288 6289 /// Select an "extended register" operand. This operand folds in an extend 6290 /// followed by an optional left shift. 6291 InstructionSelector::ComplexRendererFns 6292 AArch64InstructionSelector::selectArithExtendedRegister( 6293 MachineOperand &Root) const { 6294 if (!Root.isReg()) 6295 return None; 6296 MachineRegisterInfo &MRI = 6297 Root.getParent()->getParent()->getParent()->getRegInfo(); 6298 6299 uint64_t ShiftVal = 0; 6300 Register ExtReg; 6301 AArch64_AM::ShiftExtendType Ext; 6302 MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); 6303 if (!RootDef) 6304 return None; 6305 6306 if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) 6307 return None; 6308 6309 // Check if we can fold a shift and an extend. 6310 if (RootDef->getOpcode() == TargetOpcode::G_SHL) { 6311 // Look for a constant on the RHS of the shift. 6312 MachineOperand &RHS = RootDef->getOperand(2); 6313 Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS); 6314 if (!MaybeShiftVal) 6315 return None; 6316 ShiftVal = *MaybeShiftVal; 6317 if (ShiftVal > 4) 6318 return None; 6319 // Look for a valid extend instruction on the LHS of the shift. 6320 MachineOperand &LHS = RootDef->getOperand(1); 6321 MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); 6322 if (!ExtDef) 6323 return None; 6324 Ext = getExtendTypeForInst(*ExtDef, MRI); 6325 if (Ext == AArch64_AM::InvalidShiftExtend) 6326 return None; 6327 ExtReg = ExtDef->getOperand(1).getReg(); 6328 } else { 6329 // Didn't get a shift. Try just folding an extend. 6330 Ext = getExtendTypeForInst(*RootDef, MRI); 6331 if (Ext == AArch64_AM::InvalidShiftExtend) 6332 return None; 6333 ExtReg = RootDef->getOperand(1).getReg(); 6334 6335 // If we have a 32 bit instruction which zeroes out the high half of a 6336 // register, we get an implicit zero extend for free. Check if we have one. 6337 // FIXME: We actually emit the extend right now even though we don't have 6338 // to. 6339 if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { 6340 MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); 6341 if (ExtInst && isDef32(*ExtInst)) 6342 return None; 6343 } 6344 } 6345 6346 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister 6347 // copy. 
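// (The extended-register forms used here take the value to extend from a
// W register, hence the narrowing.)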

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  Optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}

void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
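  // Conservatively treat these copy-like opcodes as not guaranteeing zeroed
  // high bits.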
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
    MachineOperand &MO = MI.getOperand(OpIdx);
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs, build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    //  =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
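
    // First, scan the incoming values and record which register banks appear.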
    bool HasGPROp = false, HasFPROp = false;
    for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
      const auto &MO = MI->getOperand(OpIdx);
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
} // namespace llvm