1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AArch64. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64GlobalISelUtils.h" 15 #include "AArch64InstrInfo.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64RegisterBankInfo.h" 18 #include "AArch64RegisterInfo.h" 19 #include "AArch64Subtarget.h" 20 #include "AArch64TargetMachine.h" 21 #include "MCTargetDesc/AArch64AddressingModes.h" 22 #include "MCTargetDesc/AArch64MCTargetDesc.h" 23 #include "llvm/ADT/Optional.h" 24 #include "llvm/BinaryFormat/Dwarf.h" 25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 27 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 30 #include "llvm/CodeGen/GlobalISel/Utils.h" 31 #include "llvm/CodeGen/MachineBasicBlock.h" 32 #include "llvm/CodeGen/MachineConstantPool.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineInstr.h" 36 #include "llvm/CodeGen/MachineInstrBuilder.h" 37 #include "llvm/CodeGen/MachineMemOperand.h" 38 #include "llvm/CodeGen/MachineOperand.h" 39 #include "llvm/CodeGen/MachineRegisterInfo.h" 40 #include "llvm/CodeGen/TargetOpcodes.h" 41 #include "llvm/IR/Constants.h" 42 #include "llvm/IR/DerivedTypes.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/IntrinsicsAArch64.h" 45 #include "llvm/IR/PatternMatch.h" 46 #include "llvm/IR/Type.h" 47 #include "llvm/Pass.h" 48 #include "llvm/Support/Debug.h" 49 #include "llvm/Support/raw_ostream.h" 50 51 #define DEBUG_TYPE "aarch64-isel" 52 53 using namespace llvm; 54 using namespace MIPatternMatch; 55 using namespace AArch64GISelUtils; 56 57 namespace llvm { 58 class BlockFrequencyInfo; 59 class ProfileSummaryInfo; 60 } 61 62 namespace { 63 64 #define GET_GLOBALISEL_PREDICATE_BITSET 65 #include "AArch64GenGlobalISel.inc" 66 #undef GET_GLOBALISEL_PREDICATE_BITSET 67 68 69 class AArch64InstructionSelector : public InstructionSelector { 70 public: 71 AArch64InstructionSelector(const AArch64TargetMachine &TM, 72 const AArch64Subtarget &STI, 73 const AArch64RegisterBankInfo &RBI); 74 75 bool select(MachineInstr &I) override; 76 static const char *getName() { return DEBUG_TYPE; } 77 78 void setupMF(MachineFunction &MF, GISelKnownBits *KB, 79 CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI, 80 BlockFrequencyInfo *BFI) override { 81 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); 82 MIB.setMF(MF); 83 84 // hasFnAttribute() is expensive to call on every BRCOND selection, so 85 // cache it here for each run of the selector. 86 ProduceNonFlagSettingCondBr = 87 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 88 MFReturnAddr = Register(); 89 90 processPHIs(MF); 91 } 92 93 private: 94 /// tblgen-erated 'select' implementation, used as the initial selector for 95 /// the patterns that don't require complex C++. 
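  /// The body of selectImpl() is generated by TableGen and is pulled in from
  /// AArch64GenGlobalISel.inc under GET_GLOBALISEL_IMPL further below.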
96 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; 97 98 // A lowering phase that runs before any selection attempts. 99 // Returns true if the instruction was modified. 100 bool preISelLower(MachineInstr &I); 101 102 // An early selection function that runs before the selectImpl() call. 103 bool earlySelect(MachineInstr &I); 104 105 // Do some preprocessing of G_PHIs before we begin selection. 106 void processPHIs(MachineFunction &MF); 107 108 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI); 109 110 /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 111 bool contractCrossBankCopyIntoStore(MachineInstr &I, 112 MachineRegisterInfo &MRI); 113 114 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); 115 116 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, 117 MachineRegisterInfo &MRI) const; 118 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, 119 MachineRegisterInfo &MRI) const; 120 121 ///@{ 122 /// Helper functions for selectCompareBranch. 123 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, 124 MachineIRBuilder &MIB) const; 125 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 126 MachineIRBuilder &MIB) const; 127 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 128 MachineIRBuilder &MIB) const; 129 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, 130 MachineBasicBlock *DstMBB, 131 MachineIRBuilder &MIB) const; 132 ///@} 133 134 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, 135 MachineRegisterInfo &MRI); 136 137 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI); 138 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI); 139 140 // Helper to generate an equivalent of scalar_to_vector into a new register, 141 // returned via 'Dst'. 142 MachineInstr *emitScalarToVector(unsigned EltSize, 143 const TargetRegisterClass *DstRC, 144 Register Scalar, 145 MachineIRBuilder &MIRBuilder) const; 146 147 /// Emit a lane insert into \p DstReg, or a new vector register if None is 148 /// provided. 149 /// 150 /// The lane inserted into is defined by \p LaneIdx. The vector source 151 /// register is given by \p SrcReg. The register containing the element is 152 /// given by \p EltReg. 153 MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg, 154 Register EltReg, unsigned LaneIdx, 155 const RegisterBank &RB, 156 MachineIRBuilder &MIRBuilder) const; 157 158 /// Emit a sequence of instructions representing a constant \p CV for a 159 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.) 160 /// 161 /// \returns the last instruction in the sequence on success, and nullptr 162 /// otherwise. 163 MachineInstr *emitConstantVector(Register Dst, Constant *CV, 164 MachineIRBuilder &MIRBuilder, 165 MachineRegisterInfo &MRI); 166 167 bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI); 168 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, 169 MachineRegisterInfo &MRI); 170 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a 171 /// SUBREG_TO_REG. 
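  /// E.g. (illustrative) a G_BUILD_VECTOR whose lanes other than lane 0 are
  /// all fed by G_IMPLICIT_DEF can be rewritten as a SUBREG_TO_REG of the
  /// single defined lane.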
172 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI); 173 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI); 174 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 175 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 176 177 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI); 178 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI); 179 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI); 180 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI); 181 182 /// Helper function to select vector load intrinsics like 183 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc. 184 /// \p Opc is the opcode that the selected instruction should use. 185 /// \p NumVecs is the number of vector destinations for the instruction. 186 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction. 187 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs, 188 MachineInstr &I); 189 bool selectIntrinsicWithSideEffects(MachineInstr &I, 190 MachineRegisterInfo &MRI); 191 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); 192 bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI); 193 bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; 194 bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; 195 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI); 196 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI); 197 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI); 198 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); 199 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); 200 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); 201 202 unsigned emitConstantPoolEntry(const Constant *CPVal, 203 MachineFunction &MF) const; 204 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, 205 MachineIRBuilder &MIRBuilder) const; 206 207 // Emit a vector concat operation. 208 MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1, 209 Register Op2, 210 MachineIRBuilder &MIRBuilder) const; 211 212 // Emit an integer compare between LHS and RHS, which checks for Predicate. 213 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 214 MachineOperand &Predicate, 215 MachineIRBuilder &MIRBuilder) const; 216 217 /// Emit a floating point comparison between \p LHS and \p RHS. 218 /// \p Pred if given is the intended predicate to use. 219 MachineInstr *emitFPCompare(Register LHS, Register RHS, 220 MachineIRBuilder &MIRBuilder, 221 Optional<CmpInst::Predicate> = None) const; 222 223 MachineInstr *emitInstr(unsigned Opcode, 224 std::initializer_list<llvm::DstOp> DstOps, 225 std::initializer_list<llvm::SrcOp> SrcOps, 226 MachineIRBuilder &MIRBuilder, 227 const ComplexRendererFns &RenderFns = None) const; 228 /// Helper function to emit an add or sub instruction. 229 /// 230 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above 231 /// in a specific order. 232 /// 233 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. 
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
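  ///
  /// For example (illustrative only), a value computed as
  /// (icmp slt a, b) && (icmp eq c, d) can be emitted as a flag-setting
  /// compare followed by a CCMP, with the resulting condition code returned
  /// in \p OutCC.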
302 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, 303 MachineIRBuilder &MIB) const; 304 MachineInstr *emitConditionalComparison(Register LHS, Register RHS, 305 CmpInst::Predicate CC, 306 AArch64CC::CondCode Predicate, 307 AArch64CC::CondCode OutCC, 308 MachineIRBuilder &MIB) const; 309 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, 310 bool Negate, Register CCOp, 311 AArch64CC::CondCode Predicate, 312 MachineIRBuilder &MIB) const; 313 314 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. 315 /// \p IsNegative is true if the test should be "not zero". 316 /// This will also optimize the test bit instruction when possible. 317 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, 318 MachineBasicBlock *DstMBB, 319 MachineIRBuilder &MIB) const; 320 321 /// Emit a CB(N)Z instruction which branches to \p DestMBB. 322 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, 323 MachineBasicBlock *DestMBB, 324 MachineIRBuilder &MIB) const; 325 326 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. 327 // We use these manually instead of using the importer since it doesn't 328 // support SDNodeXForm. 329 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; 330 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; 331 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; 332 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; 333 334 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; 335 ComplexRendererFns selectArithImmed(MachineOperand &Root) const; 336 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; 337 338 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, 339 unsigned Size) const; 340 341 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { 342 return selectAddrModeUnscaled(Root, 1); 343 } 344 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { 345 return selectAddrModeUnscaled(Root, 2); 346 } 347 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { 348 return selectAddrModeUnscaled(Root, 4); 349 } 350 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { 351 return selectAddrModeUnscaled(Root, 8); 352 } 353 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { 354 return selectAddrModeUnscaled(Root, 16); 355 } 356 357 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used 358 /// from complex pattern matchers like selectAddrModeIndexed(). 359 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, 360 MachineRegisterInfo &MRI) const; 361 362 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, 363 unsigned Size) const; 364 template <int Width> 365 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { 366 return selectAddrModeIndexed(Root, Width / 8); 367 } 368 369 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, 370 const MachineRegisterInfo &MRI) const; 371 ComplexRendererFns 372 selectAddrModeShiftedExtendXReg(MachineOperand &Root, 373 unsigned SizeInBytes) const; 374 375 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether 376 /// or not a shift + extend should be folded into an addressing mode. Returns 377 /// None when this is not profitable or possible. 
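  ///
  /// E.g. (illustrative) an offset produced by G_SHL %idx, 3 feeding an
  /// 8-byte access can be folded into the [base, xN, lsl #3] addressing mode.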
378 ComplexRendererFns 379 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, 380 MachineOperand &Offset, unsigned SizeInBytes, 381 bool WantsExt) const; 382 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; 383 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, 384 unsigned SizeInBytes) const; 385 template <int Width> 386 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { 387 return selectAddrModeXRO(Root, Width / 8); 388 } 389 390 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, 391 unsigned SizeInBytes) const; 392 template <int Width> 393 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { 394 return selectAddrModeWRO(Root, Width / 8); 395 } 396 397 ComplexRendererFns selectShiftedRegister(MachineOperand &Root, 398 bool AllowROR = false) const; 399 400 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { 401 return selectShiftedRegister(Root); 402 } 403 404 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { 405 return selectShiftedRegister(Root, true); 406 } 407 408 /// Given an extend instruction, determine the correct shift-extend type for 409 /// that instruction. 410 /// 411 /// If the instruction is going to be used in a load or store, pass 412 /// \p IsLoadStore = true. 413 AArch64_AM::ShiftExtendType 414 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, 415 bool IsLoadStore = false) const; 416 417 /// Move \p Reg to \p RC if \p Reg is not already on \p RC. 418 /// 419 /// \returns Either \p Reg if no change was necessary, or the new register 420 /// created by moving \p Reg. 421 /// 422 /// Note: This uses emitCopy right now. 423 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, 424 MachineIRBuilder &MIB) const; 425 426 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; 427 428 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, 429 int OpIdx = -1) const; 430 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, 431 int OpIdx = -1) const; 432 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, 433 int OpIdx = -1) const; 434 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, 435 int OpIdx = -1) const; 436 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, 437 int OpIdx = -1) const; 438 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, 439 int OpIdx = -1) const; 440 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB, 441 const MachineInstr &MI, 442 int OpIdx = -1) const; 443 444 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. 445 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); 446 447 // Optimization methods. 448 bool tryOptSelect(GSelect &Sel); 449 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); 450 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 451 MachineOperand &Predicate, 452 MachineIRBuilder &MIRBuilder) const; 453 454 /// Return true if \p MI is a load or store of \p NumBytes bytes. 455 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; 456 457 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit 458 /// register zeroed out. In other words, the result of MI has been explicitly 459 /// zero extended. 
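  ///
  /// E.g. any instruction writing a W register implicitly zeroes bits
  /// [63:32] of the corresponding X register, so a later zero-extension of
  /// such a def is redundant.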
460 bool isDef32(const MachineInstr &MI) const; 461 462 const AArch64TargetMachine &TM; 463 const AArch64Subtarget &STI; 464 const AArch64InstrInfo &TII; 465 const AArch64RegisterInfo &TRI; 466 const AArch64RegisterBankInfo &RBI; 467 468 bool ProduceNonFlagSettingCondBr = false; 469 470 // Some cached values used during selection. 471 // We use LR as a live-in register, and we keep track of it here as it can be 472 // clobbered by calls. 473 Register MFReturnAddr; 474 475 MachineIRBuilder MIB; 476 477 #define GET_GLOBALISEL_PREDICATES_DECL 478 #include "AArch64GenGlobalISel.inc" 479 #undef GET_GLOBALISEL_PREDICATES_DECL 480 481 // We declare the temporaries used by selectImpl() in the class to minimize the 482 // cost of constructing placeholder values. 483 #define GET_GLOBALISEL_TEMPORARIES_DECL 484 #include "AArch64GenGlobalISel.inc" 485 #undef GET_GLOBALISEL_TEMPORARIES_DECL 486 }; 487 488 } // end anonymous namespace 489 490 #define GET_GLOBALISEL_IMPL 491 #include "AArch64GenGlobalISel.inc" 492 #undef GET_GLOBALISEL_IMPL 493 494 AArch64InstructionSelector::AArch64InstructionSelector( 495 const AArch64TargetMachine &TM, const AArch64Subtarget &STI, 496 const AArch64RegisterBankInfo &RBI) 497 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), 498 RBI(RBI), 499 #define GET_GLOBALISEL_PREDICATES_INIT 500 #include "AArch64GenGlobalISel.inc" 501 #undef GET_GLOBALISEL_PREDICATES_INIT 502 #define GET_GLOBALISEL_TEMPORARIES_INIT 503 #include "AArch64GenGlobalISel.inc" 504 #undef GET_GLOBALISEL_TEMPORARIES_INIT 505 { 506 } 507 508 // FIXME: This should be target-independent, inferred from the types declared 509 // for each class in the bank. 510 // 511 /// Given a register bank, and a type, return the smallest register class that 512 /// can represent that combination. 513 static const TargetRegisterClass * 514 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, 515 bool GetAllRegSet = false) { 516 if (RB.getID() == AArch64::GPRRegBankID) { 517 if (Ty.getSizeInBits() <= 32) 518 return GetAllRegSet ? &AArch64::GPR32allRegClass 519 : &AArch64::GPR32RegClass; 520 if (Ty.getSizeInBits() == 64) 521 return GetAllRegSet ? &AArch64::GPR64allRegClass 522 : &AArch64::GPR64RegClass; 523 if (Ty.getSizeInBits() == 128) 524 return &AArch64::XSeqPairsClassRegClass; 525 return nullptr; 526 } 527 528 if (RB.getID() == AArch64::FPRRegBankID) { 529 switch (Ty.getSizeInBits()) { 530 case 8: 531 return &AArch64::FPR8RegClass; 532 case 16: 533 return &AArch64::FPR16RegClass; 534 case 32: 535 return &AArch64::FPR32RegClass; 536 case 64: 537 return &AArch64::FPR64RegClass; 538 case 128: 539 return &AArch64::FPR128RegClass; 540 } 541 return nullptr; 542 } 543 544 return nullptr; 545 } 546 547 /// Given a register bank, and size in bits, return the smallest register class 548 /// that can represent that combination. 549 static const TargetRegisterClass * 550 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, 551 bool GetAllRegSet = false) { 552 unsigned RegBankID = RB.getID(); 553 554 if (RegBankID == AArch64::GPRRegBankID) { 555 if (SizeInBits <= 32) 556 return GetAllRegSet ? &AArch64::GPR32allRegClass 557 : &AArch64::GPR32RegClass; 558 if (SizeInBits == 64) 559 return GetAllRegSet ? 
&AArch64::GPR64allRegClass 560 : &AArch64::GPR64RegClass; 561 if (SizeInBits == 128) 562 return &AArch64::XSeqPairsClassRegClass; 563 } 564 565 if (RegBankID == AArch64::FPRRegBankID) { 566 switch (SizeInBits) { 567 default: 568 return nullptr; 569 case 8: 570 return &AArch64::FPR8RegClass; 571 case 16: 572 return &AArch64::FPR16RegClass; 573 case 32: 574 return &AArch64::FPR32RegClass; 575 case 64: 576 return &AArch64::FPR64RegClass; 577 case 128: 578 return &AArch64::FPR128RegClass; 579 } 580 } 581 582 return nullptr; 583 } 584 585 /// Returns the correct subregister to use for a given register class. 586 static bool getSubRegForClass(const TargetRegisterClass *RC, 587 const TargetRegisterInfo &TRI, unsigned &SubReg) { 588 switch (TRI.getRegSizeInBits(*RC)) { 589 case 8: 590 SubReg = AArch64::bsub; 591 break; 592 case 16: 593 SubReg = AArch64::hsub; 594 break; 595 case 32: 596 if (RC != &AArch64::FPR32RegClass) 597 SubReg = AArch64::sub_32; 598 else 599 SubReg = AArch64::ssub; 600 break; 601 case 64: 602 SubReg = AArch64::dsub; 603 break; 604 default: 605 LLVM_DEBUG( 606 dbgs() << "Couldn't find appropriate subregister for register class."); 607 return false; 608 } 609 610 return true; 611 } 612 613 /// Returns the minimum size the given register bank can hold. 614 static unsigned getMinSizeForRegBank(const RegisterBank &RB) { 615 switch (RB.getID()) { 616 case AArch64::GPRRegBankID: 617 return 32; 618 case AArch64::FPRRegBankID: 619 return 8; 620 default: 621 llvm_unreachable("Tried to get minimum size for unknown register bank."); 622 } 623 } 624 625 /// Create a REG_SEQUENCE instruction using the registers in \p Regs. 626 /// Helper function for functions like createDTuple and createQTuple. 627 /// 628 /// \p RegClassIDs - The list of register class IDs available for some tuple of 629 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is 630 /// expected to contain between 2 and 4 tuple classes. 631 /// 632 /// \p SubRegs - The list of subregister classes associated with each register 633 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0 634 /// subregister class. The index of each subregister class is expected to 635 /// correspond with the index of each register class. 636 /// 637 /// \returns Either the destination register of REG_SEQUENCE instruction that 638 /// was created, or the 0th element of \p Regs if \p Regs contains a single 639 /// element. 640 static Register createTuple(ArrayRef<Register> Regs, 641 const unsigned RegClassIDs[], 642 const unsigned SubRegs[], MachineIRBuilder &MIB) { 643 unsigned NumRegs = Regs.size(); 644 if (NumRegs == 1) 645 return Regs[0]; 646 assert(NumRegs >= 2 && NumRegs <= 4 && 647 "Only support between two and 4 registers in a tuple!"); 648 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo(); 649 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]); 650 auto RegSequence = 651 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {}); 652 for (unsigned I = 0, E = Regs.size(); I < E; ++I) { 653 RegSequence.addUse(Regs[I]); 654 RegSequence.addImm(SubRegs[I]); 655 } 656 return RegSequence.getReg(0); 657 } 658 659 /// Create a tuple of D-registers using the registers in \p Regs. 
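/// For a two-register input this emits, roughly (illustrative operands):
///   %tuple:dd = REG_SEQUENCE %reg0, %subreg.dsub0, %reg1, %subreg.dsub1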
660 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 661 static const unsigned RegClassIDs[] = { 662 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; 663 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, 664 AArch64::dsub2, AArch64::dsub3}; 665 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 666 } 667 668 /// Create a tuple of Q-registers using the registers in \p Regs. 669 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 670 static const unsigned RegClassIDs[] = { 671 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; 672 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, 673 AArch64::qsub2, AArch64::qsub3}; 674 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 675 } 676 677 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { 678 auto &MI = *Root.getParent(); 679 auto &MBB = *MI.getParent(); 680 auto &MF = *MBB.getParent(); 681 auto &MRI = MF.getRegInfo(); 682 uint64_t Immed; 683 if (Root.isImm()) 684 Immed = Root.getImm(); 685 else if (Root.isCImm()) 686 Immed = Root.getCImm()->getZExtValue(); 687 else if (Root.isReg()) { 688 auto ValAndVReg = 689 getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true); 690 if (!ValAndVReg) 691 return None; 692 Immed = ValAndVReg->Value.getSExtValue(); 693 } else 694 return None; 695 return Immed; 696 } 697 698 /// Check whether \p I is a currently unsupported binary operation: 699 /// - it has an unsized type 700 /// - an operand is not a vreg 701 /// - all operands are not in the same bank 702 /// These are checks that should someday live in the verifier, but right now, 703 /// these are mostly limitations of the aarch64 selector. 704 static bool unsupportedBinOp(const MachineInstr &I, 705 const AArch64RegisterBankInfo &RBI, 706 const MachineRegisterInfo &MRI, 707 const AArch64RegisterInfo &TRI) { 708 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 709 if (!Ty.isValid()) { 710 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); 711 return true; 712 } 713 714 const RegisterBank *PrevOpBank = nullptr; 715 for (auto &MO : I.operands()) { 716 // FIXME: Support non-register operands. 717 if (!MO.isReg()) { 718 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); 719 return true; 720 } 721 722 // FIXME: Can generic operations have physical registers operands? If 723 // so, this will need to be taught about that, and we'll need to get the 724 // bank out of the minimal class for the register. 725 // Either way, this needs to be documented (and possibly verified). 726 if (!Register::isVirtualRegister(MO.getReg())) { 727 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); 728 return true; 729 } 730 731 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); 732 if (!OpBank) { 733 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); 734 return true; 735 } 736 737 if (PrevOpBank && OpBank != PrevOpBank) { 738 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); 739 return true; 740 } 741 PrevOpBank = OpBank; 742 } 743 return false; 744 } 745 746 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc 747 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID 748 /// and of size \p OpSize. 749 /// \returns \p GenericOpc if the combination is unsupported. 
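/// E.g. (G_SHL, GPRRegBankID, 32) maps to LSLVWr, while an unhandled
/// combination such as (G_OR, GPRRegBankID, 32) is returned unchanged.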
750 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, 751 unsigned OpSize) { 752 switch (RegBankID) { 753 case AArch64::GPRRegBankID: 754 if (OpSize == 32) { 755 switch (GenericOpc) { 756 case TargetOpcode::G_SHL: 757 return AArch64::LSLVWr; 758 case TargetOpcode::G_LSHR: 759 return AArch64::LSRVWr; 760 case TargetOpcode::G_ASHR: 761 return AArch64::ASRVWr; 762 default: 763 return GenericOpc; 764 } 765 } else if (OpSize == 64) { 766 switch (GenericOpc) { 767 case TargetOpcode::G_PTR_ADD: 768 return AArch64::ADDXrr; 769 case TargetOpcode::G_SHL: 770 return AArch64::LSLVXr; 771 case TargetOpcode::G_LSHR: 772 return AArch64::LSRVXr; 773 case TargetOpcode::G_ASHR: 774 return AArch64::ASRVXr; 775 default: 776 return GenericOpc; 777 } 778 } 779 break; 780 case AArch64::FPRRegBankID: 781 switch (OpSize) { 782 case 32: 783 switch (GenericOpc) { 784 case TargetOpcode::G_FADD: 785 return AArch64::FADDSrr; 786 case TargetOpcode::G_FSUB: 787 return AArch64::FSUBSrr; 788 case TargetOpcode::G_FMUL: 789 return AArch64::FMULSrr; 790 case TargetOpcode::G_FDIV: 791 return AArch64::FDIVSrr; 792 default: 793 return GenericOpc; 794 } 795 case 64: 796 switch (GenericOpc) { 797 case TargetOpcode::G_FADD: 798 return AArch64::FADDDrr; 799 case TargetOpcode::G_FSUB: 800 return AArch64::FSUBDrr; 801 case TargetOpcode::G_FMUL: 802 return AArch64::FMULDrr; 803 case TargetOpcode::G_FDIV: 804 return AArch64::FDIVDrr; 805 case TargetOpcode::G_OR: 806 return AArch64::ORRv8i8; 807 default: 808 return GenericOpc; 809 } 810 } 811 break; 812 } 813 return GenericOpc; 814 } 815 816 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, 817 /// appropriate for the (value) register bank \p RegBankID and of memory access 818 /// size \p OpSize. This returns the variant with the base+unsigned-immediate 819 /// addressing mode (e.g., LDRXui). 820 /// \returns \p GenericOpc if the combination is unsupported. 821 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, 822 unsigned OpSize) { 823 const bool isStore = GenericOpc == TargetOpcode::G_STORE; 824 switch (RegBankID) { 825 case AArch64::GPRRegBankID: 826 switch (OpSize) { 827 case 8: 828 return isStore ? AArch64::STRBBui : AArch64::LDRBBui; 829 case 16: 830 return isStore ? AArch64::STRHHui : AArch64::LDRHHui; 831 case 32: 832 return isStore ? AArch64::STRWui : AArch64::LDRWui; 833 case 64: 834 return isStore ? AArch64::STRXui : AArch64::LDRXui; 835 } 836 break; 837 case AArch64::FPRRegBankID: 838 switch (OpSize) { 839 case 8: 840 return isStore ? AArch64::STRBui : AArch64::LDRBui; 841 case 16: 842 return isStore ? AArch64::STRHui : AArch64::LDRHui; 843 case 32: 844 return isStore ? AArch64::STRSui : AArch64::LDRSui; 845 case 64: 846 return isStore ? AArch64::STRDui : AArch64::LDRDui; 847 case 128: 848 return isStore ? AArch64::STRQui : AArch64::LDRQui; 849 } 850 break; 851 } 852 return GenericOpc; 853 } 854 855 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg 856 /// to \p *To. 
857 /// 858 /// E.g "To = COPY SrcReg:SubReg" 859 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI, 860 const RegisterBankInfo &RBI, Register SrcReg, 861 const TargetRegisterClass *To, unsigned SubReg) { 862 assert(SrcReg.isValid() && "Expected a valid source register?"); 863 assert(To && "Destination register class cannot be null"); 864 assert(SubReg && "Expected a valid subregister"); 865 866 MachineIRBuilder MIB(I); 867 auto SubRegCopy = 868 MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg); 869 MachineOperand &RegOp = I.getOperand(1); 870 RegOp.setReg(SubRegCopy.getReg(0)); 871 872 // It's possible that the destination register won't be constrained. Make 873 // sure that happens. 874 if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) 875 RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); 876 877 return true; 878 } 879 880 /// Helper function to get the source and destination register classes for a 881 /// copy. Returns a std::pair containing the source register class for the 882 /// copy, and the destination register class for the copy. If a register class 883 /// cannot be determined, then it will be nullptr. 884 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> 885 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, 886 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, 887 const RegisterBankInfo &RBI) { 888 Register DstReg = I.getOperand(0).getReg(); 889 Register SrcReg = I.getOperand(1).getReg(); 890 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); 891 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); 892 unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); 893 unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); 894 895 // Special casing for cross-bank copies of s1s. We can technically represent 896 // a 1-bit value with any size of register. The minimum size for a GPR is 32 897 // bits. So, we need to put the FPR on 32 bits as well. 898 // 899 // FIXME: I'm not sure if this case holds true outside of copies. If it does, 900 // then we can pull it into the helpers that get the appropriate class for a 901 // register bank. Or make a new helper that carries along some constraint 902 // information. 903 if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) 904 SrcSize = DstSize = 32; 905 906 return {getMinClassForRegBank(SrcRegBank, SrcSize, true), 907 getMinClassForRegBank(DstRegBank, DstSize, true)}; 908 } 909 910 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, 911 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, 912 const RegisterBankInfo &RBI) { 913 Register DstReg = I.getOperand(0).getReg(); 914 Register SrcReg = I.getOperand(1).getReg(); 915 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); 916 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); 917 918 // Find the correct register classes for the source and destination registers. 919 const TargetRegisterClass *SrcRC; 920 const TargetRegisterClass *DstRC; 921 std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); 922 923 if (!DstRC) { 924 LLVM_DEBUG(dbgs() << "Unexpected dest size " 925 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); 926 return false; 927 } 928 929 // Is this a copy? If so, then we may need to insert a subregister copy. 930 if (I.isCopy()) { 931 // Yes. Check if there's anything to fix up. 
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (Register::isPhysicalRegister(DstReg))
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT, we can just reduce it down into a copy.
  // The sizes will be mismatched with the source < 32b, but that's OK.
991 if (I.getOpcode() == TargetOpcode::G_ZEXT) { 992 I.setDesc(TII.get(AArch64::COPY)); 993 assert(SrcRegBank.getID() == AArch64::GPRRegBankID); 994 return selectCopy(I, TII, MRI, TRI, RBI); 995 } 996 997 I.setDesc(TII.get(AArch64::COPY)); 998 return true; 999 } 1000 1001 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { 1002 if (!DstTy.isScalar() || !SrcTy.isScalar()) 1003 return GenericOpc; 1004 1005 const unsigned DstSize = DstTy.getSizeInBits(); 1006 const unsigned SrcSize = SrcTy.getSizeInBits(); 1007 1008 switch (DstSize) { 1009 case 32: 1010 switch (SrcSize) { 1011 case 32: 1012 switch (GenericOpc) { 1013 case TargetOpcode::G_SITOFP: 1014 return AArch64::SCVTFUWSri; 1015 case TargetOpcode::G_UITOFP: 1016 return AArch64::UCVTFUWSri; 1017 case TargetOpcode::G_FPTOSI: 1018 return AArch64::FCVTZSUWSr; 1019 case TargetOpcode::G_FPTOUI: 1020 return AArch64::FCVTZUUWSr; 1021 default: 1022 return GenericOpc; 1023 } 1024 case 64: 1025 switch (GenericOpc) { 1026 case TargetOpcode::G_SITOFP: 1027 return AArch64::SCVTFUXSri; 1028 case TargetOpcode::G_UITOFP: 1029 return AArch64::UCVTFUXSri; 1030 case TargetOpcode::G_FPTOSI: 1031 return AArch64::FCVTZSUWDr; 1032 case TargetOpcode::G_FPTOUI: 1033 return AArch64::FCVTZUUWDr; 1034 default: 1035 return GenericOpc; 1036 } 1037 default: 1038 return GenericOpc; 1039 } 1040 case 64: 1041 switch (SrcSize) { 1042 case 32: 1043 switch (GenericOpc) { 1044 case TargetOpcode::G_SITOFP: 1045 return AArch64::SCVTFUWDri; 1046 case TargetOpcode::G_UITOFP: 1047 return AArch64::UCVTFUWDri; 1048 case TargetOpcode::G_FPTOSI: 1049 return AArch64::FCVTZSUXSr; 1050 case TargetOpcode::G_FPTOUI: 1051 return AArch64::FCVTZUUXSr; 1052 default: 1053 return GenericOpc; 1054 } 1055 case 64: 1056 switch (GenericOpc) { 1057 case TargetOpcode::G_SITOFP: 1058 return AArch64::SCVTFUXDri; 1059 case TargetOpcode::G_UITOFP: 1060 return AArch64::UCVTFUXDri; 1061 case TargetOpcode::G_FPTOSI: 1062 return AArch64::FCVTZSUXDr; 1063 case TargetOpcode::G_FPTOUI: 1064 return AArch64::FCVTZUUXDr; 1065 default: 1066 return GenericOpc; 1067 } 1068 default: 1069 return GenericOpc; 1070 } 1071 default: 1072 return GenericOpc; 1073 }; 1074 return GenericOpc; 1075 } 1076 1077 MachineInstr * 1078 AArch64InstructionSelector::emitSelect(Register Dst, Register True, 1079 Register False, AArch64CC::CondCode CC, 1080 MachineIRBuilder &MIB) const { 1081 MachineRegisterInfo &MRI = *MIB.getMRI(); 1082 assert(RBI.getRegBank(False, MRI, TRI)->getID() == 1083 RBI.getRegBank(True, MRI, TRI)->getID() && 1084 "Expected both select operands to have the same regbank?"); 1085 LLT Ty = MRI.getType(True); 1086 if (Ty.isVector()) 1087 return nullptr; 1088 const unsigned Size = Ty.getSizeInBits(); 1089 assert((Size == 32 || Size == 64) && 1090 "Expected 32 bit or 64 bit select only?"); 1091 const bool Is32Bit = Size == 32; 1092 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { 1093 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; 1094 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); 1095 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); 1096 return &*FCSel; 1097 } 1098 1099 // By default, we'll try and emit a CSEL. 1100 unsigned Opc = Is32Bit ? 
AArch64::CSELWr : AArch64::CSELXr; 1101 bool Optimized = false; 1102 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, 1103 &Optimized](Register &Reg, Register &OtherReg, 1104 bool Invert) { 1105 if (Optimized) 1106 return false; 1107 1108 // Attempt to fold: 1109 // 1110 // %sub = G_SUB 0, %x 1111 // %select = G_SELECT cc, %reg, %sub 1112 // 1113 // Into: 1114 // %select = CSNEG %reg, %x, cc 1115 Register MatchReg; 1116 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { 1117 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; 1118 Reg = MatchReg; 1119 if (Invert) { 1120 CC = AArch64CC::getInvertedCondCode(CC); 1121 std::swap(Reg, OtherReg); 1122 } 1123 return true; 1124 } 1125 1126 // Attempt to fold: 1127 // 1128 // %xor = G_XOR %x, -1 1129 // %select = G_SELECT cc, %reg, %xor 1130 // 1131 // Into: 1132 // %select = CSINV %reg, %x, cc 1133 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { 1134 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1135 Reg = MatchReg; 1136 if (Invert) { 1137 CC = AArch64CC::getInvertedCondCode(CC); 1138 std::swap(Reg, OtherReg); 1139 } 1140 return true; 1141 } 1142 1143 // Attempt to fold: 1144 // 1145 // %add = G_ADD %x, 1 1146 // %select = G_SELECT cc, %reg, %add 1147 // 1148 // Into: 1149 // %select = CSINC %reg, %x, cc 1150 if (mi_match(Reg, MRI, 1151 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)), 1152 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) { 1153 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1154 Reg = MatchReg; 1155 if (Invert) { 1156 CC = AArch64CC::getInvertedCondCode(CC); 1157 std::swap(Reg, OtherReg); 1158 } 1159 return true; 1160 } 1161 1162 return false; 1163 }; 1164 1165 // Helper lambda which tries to use CSINC/CSINV for the instruction when its 1166 // true/false values are constants. 1167 // FIXME: All of these patterns already exist in tablegen. We should be 1168 // able to import these. 1169 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, 1170 &Optimized]() { 1171 if (Optimized) 1172 return false; 1173 auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI); 1174 auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI); 1175 if (!TrueCst && !FalseCst) 1176 return false; 1177 1178 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; 1179 if (TrueCst && FalseCst) { 1180 int64_t T = TrueCst->Value.getSExtValue(); 1181 int64_t F = FalseCst->Value.getSExtValue(); 1182 1183 if (T == 0 && F == 1) { 1184 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc 1185 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1186 True = ZReg; 1187 False = ZReg; 1188 return true; 1189 } 1190 1191 if (T == 0 && F == -1) { 1192 // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc 1193 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1194 True = ZReg; 1195 False = ZReg; 1196 return true; 1197 } 1198 } 1199 1200 if (TrueCst) { 1201 int64_t T = TrueCst->Value.getSExtValue(); 1202 if (T == 1) { 1203 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc 1204 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1205 True = False; 1206 False = ZReg; 1207 CC = AArch64CC::getInvertedCondCode(CC); 1208 return true; 1209 } 1210 1211 if (T == -1) { 1212 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc 1213 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1214 True = False; 1215 False = ZReg; 1216 CC = AArch64CC::getInvertedCondCode(CC); 1217 return true; 1218 } 1219 } 1220 1221 if (FalseCst) { 1222 int64_t F = FalseCst->Value.getSExtValue(); 1223 if (F == 1) { 1224 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc 1225 Opc = Is32Bit ? 
AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
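/// E.g. FCMP_ONE is mapped to {VC, NE}: "ordered and not equal" holds only
/// when both conditions are true, so the two condition codes combine with
/// AND rather than OR.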
1333 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, 1334 AArch64CC::CondCode &CondCode, 1335 AArch64CC::CondCode &CondCode2) { 1336 CondCode2 = AArch64CC::AL; 1337 switch (CC) { 1338 default: 1339 changeFPCCToORAArch64CC(CC, CondCode, CondCode2); 1340 assert(CondCode2 == AArch64CC::AL); 1341 break; 1342 case CmpInst::FCMP_ONE: 1343 // (a one b) 1344 // == ((a olt b) || (a ogt b)) 1345 // == ((a ord b) && (a une b)) 1346 CondCode = AArch64CC::VC; 1347 CondCode2 = AArch64CC::NE; 1348 break; 1349 case CmpInst::FCMP_UEQ: 1350 // (a ueq b) 1351 // == ((a uno b) || (a oeq b)) 1352 // == ((a ule b) && (a uge b)) 1353 CondCode = AArch64CC::PL; 1354 CondCode2 = AArch64CC::LE; 1355 break; 1356 } 1357 } 1358 1359 /// Return a register which can be used as a bit to test in a TB(N)Z. 1360 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, 1361 MachineRegisterInfo &MRI) { 1362 assert(Reg.isValid() && "Expected valid register!"); 1363 bool HasZext = false; 1364 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { 1365 unsigned Opc = MI->getOpcode(); 1366 1367 if (!MI->getOperand(0).isReg() || 1368 !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 1369 break; 1370 1371 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 1372 // 1373 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number 1374 // on the truncated x is the same as the bit number on x. 1375 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || 1376 Opc == TargetOpcode::G_TRUNC) { 1377 if (Opc == TargetOpcode::G_ZEXT) 1378 HasZext = true; 1379 1380 Register NextReg = MI->getOperand(1).getReg(); 1381 // Did we find something worth folding? 1382 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) 1383 break; 1384 1385 // NextReg is worth folding. Keep looking. 1386 Reg = NextReg; 1387 continue; 1388 } 1389 1390 // Attempt to find a suitable operation with a constant on one side. 1391 Optional<uint64_t> C; 1392 Register TestReg; 1393 switch (Opc) { 1394 default: 1395 break; 1396 case TargetOpcode::G_AND: 1397 case TargetOpcode::G_XOR: { 1398 TestReg = MI->getOperand(1).getReg(); 1399 Register ConstantReg = MI->getOperand(2).getReg(); 1400 auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1401 if (!VRegAndVal) { 1402 // AND commutes, check the other side for a constant. 1403 // FIXME: Can we canonicalize the constant so that it's always on the 1404 // same side at some point earlier? 1405 std::swap(ConstantReg, TestReg); 1406 VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1407 } 1408 if (VRegAndVal) { 1409 if (HasZext) 1410 C = VRegAndVal->Value.getZExtValue(); 1411 else 1412 C = VRegAndVal->Value.getSExtValue(); 1413 } 1414 break; 1415 } 1416 case TargetOpcode::G_ASHR: 1417 case TargetOpcode::G_LSHR: 1418 case TargetOpcode::G_SHL: { 1419 TestReg = MI->getOperand(1).getReg(); 1420 auto VRegAndVal = 1421 getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); 1422 if (VRegAndVal) 1423 C = VRegAndVal->Value.getSExtValue(); 1424 break; 1425 } 1426 } 1427 1428 // Didn't find a constant or viable register. Bail out of the loop. 1429 if (!C || !TestReg.isValid()) 1430 break; 1431 1432 // We found a suitable instruction with a constant. Check to see if we can 1433 // walk through the instruction. 
1434 Register NextReg; 1435 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); 1436 switch (Opc) { 1437 default: 1438 break; 1439 case TargetOpcode::G_AND: 1440 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. 1441 if ((*C >> Bit) & 1) 1442 NextReg = TestReg; 1443 break; 1444 case TargetOpcode::G_SHL: 1445 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in 1446 // the type of the register. 1447 if (*C <= Bit && (Bit - *C) < TestRegSize) { 1448 NextReg = TestReg; 1449 Bit = Bit - *C; 1450 } 1451 break; 1452 case TargetOpcode::G_ASHR: 1453 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits 1454 // in x 1455 NextReg = TestReg; 1456 Bit = Bit + *C; 1457 if (Bit >= TestRegSize) 1458 Bit = TestRegSize - 1; 1459 break; 1460 case TargetOpcode::G_LSHR: 1461 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x 1462 if ((Bit + *C) < TestRegSize) { 1463 NextReg = TestReg; 1464 Bit = Bit + *C; 1465 } 1466 break; 1467 case TargetOpcode::G_XOR: 1468 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when 1469 // appropriate. 1470 // 1471 // e.g. If x' = xor x, c, and the b-th bit is set in c then 1472 // 1473 // tbz x', b -> tbnz x, b 1474 // 1475 // Because x' only has the b-th bit set if x does not. 1476 if ((*C >> Bit) & 1) 1477 Invert = !Invert; 1478 NextReg = TestReg; 1479 break; 1480 } 1481 1482 // Check if we found anything worth folding. 1483 if (!NextReg.isValid()) 1484 return Reg; 1485 Reg = NextReg; 1486 } 1487 1488 return Reg; 1489 } 1490 1491 MachineInstr *AArch64InstructionSelector::emitTestBit( 1492 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, 1493 MachineIRBuilder &MIB) const { 1494 assert(TestReg.isValid()); 1495 assert(ProduceNonFlagSettingCondBr && 1496 "Cannot emit TB(N)Z with speculation tracking!"); 1497 MachineRegisterInfo &MRI = *MIB.getMRI(); 1498 1499 // Attempt to optimize the test bit by walking over instructions. 1500 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); 1501 LLT Ty = MRI.getType(TestReg); 1502 unsigned Size = Ty.getSizeInBits(); 1503 assert(!Ty.isVector() && "Expected a scalar!"); 1504 assert(Bit < 64 && "Bit is too large!"); 1505 1506 // When the test register is a 64-bit register, we have to narrow to make 1507 // TBNZW work. 1508 bool UseWReg = Bit < 32; 1509 unsigned NecessarySize = UseWReg ? 32 : 64; 1510 if (Size != NecessarySize) 1511 TestReg = moveScalarRegClass( 1512 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, 1513 MIB); 1514 1515 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, 1516 {AArch64::TBZW, AArch64::TBNZW}}; 1517 unsigned Opc = OpcTable[UseWReg][IsNegative]; 1518 auto TestBitMI = 1519 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); 1520 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); 1521 return &*TestBitMI; 1522 } 1523 1524 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( 1525 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, 1526 MachineIRBuilder &MIB) const { 1527 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); 1528 // Given something like this: 1529 // 1530 // %x = ...Something... 
1531 // %one = G_CONSTANT i64 1 1532 // %zero = G_CONSTANT i64 0 1533 // %and = G_AND %x, %one 1534 // %cmp = G_ICMP intpred(ne), %and, %zero 1535 // %cmp_trunc = G_TRUNC %cmp 1536 // G_BRCOND %cmp_trunc, %bb.3 1537 // 1538 // We want to try and fold the AND into the G_BRCOND and produce either a 1539 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 1540 // 1541 // In this case, we'd get 1542 // 1543 // TBNZ %x %bb.3 1544 // 1545 1546 // Check if the AND has a constant on its RHS which we can use as a mask. 1547 // If it's a power of 2, then it's the same as checking a specific bit. 1548 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) 1549 auto MaybeBit = getIConstantVRegValWithLookThrough( 1550 AndInst.getOperand(2).getReg(), *MIB.getMRI()); 1551 if (!MaybeBit) 1552 return false; 1553 1554 int32_t Bit = MaybeBit->Value.exactLogBase2(); 1555 if (Bit < 0) 1556 return false; 1557 1558 Register TestReg = AndInst.getOperand(1).getReg(); 1559 1560 // Emit a TB(N)Z. 1561 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1562 return true; 1563 } 1564 1565 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1566 bool IsNegative, 1567 MachineBasicBlock *DestMBB, 1568 MachineIRBuilder &MIB) const { 1569 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1570 MachineRegisterInfo &MRI = *MIB.getMRI(); 1571 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1572 AArch64::GPRRegBankID && 1573 "Expected GPRs only?"); 1574 auto Ty = MRI.getType(CompareReg); 1575 unsigned Width = Ty.getSizeInBits(); 1576 assert(!Ty.isVector() && "Expected scalar only?"); 1577 assert(Width <= 64 && "Expected width to be at most 64?"); 1578 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1579 {AArch64::CBNZW, AArch64::CBNZX}}; 1580 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1581 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1582 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1583 return &*BranchMI; 1584 } 1585 1586 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1587 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1588 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1589 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1590 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1591 // totally clean. Some of them require two branches to implement. 1592 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1593 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1594 Pred); 1595 AArch64CC::CondCode CC1, CC2; 1596 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1597 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1598 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1599 if (CC2 != AArch64CC::AL) 1600 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1601 I.eraseFromParent(); 1602 return true; 1603 } 1604 1605 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1606 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1607 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1608 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1609 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1610 // 1611 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1612 // instructions will not be produced, as they are conditional branch 1613 // instructions that do not set flags. 
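  // (ProduceNonFlagSettingCondBr is cached in setupMF() from the function's
  // SpeculativeLoadHardening attribute.)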
1614 if (!ProduceNonFlagSettingCondBr) 1615 return false; 1616 1617 MachineRegisterInfo &MRI = *MIB.getMRI(); 1618 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1619 auto Pred = 1620 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1621 Register LHS = ICmp.getOperand(2).getReg(); 1622 Register RHS = ICmp.getOperand(3).getReg(); 1623 1624 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1625 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1626 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1627 1628 // When we can emit a TB(N)Z, prefer that. 1629 // 1630 // Handle non-commutative condition codes first. 1631 // Note that we don't want to do this when we have a G_AND because it can 1632 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1633 if (VRegAndVal && !AndInst) { 1634 int64_t C = VRegAndVal->Value.getSExtValue(); 1635 1636 // When we have a greater-than comparison, we can just test if the msb is 1637 // zero. 1638 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1639 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1640 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1641 I.eraseFromParent(); 1642 return true; 1643 } 1644 1645 // When we have a less than comparison, we can just test if the msb is not 1646 // zero. 1647 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1648 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1649 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1650 I.eraseFromParent(); 1651 return true; 1652 } 1653 } 1654 1655 // Attempt to handle commutative condition codes. Right now, that's only 1656 // eq/ne. 1657 if (ICmpInst::isEquality(Pred)) { 1658 if (!VRegAndVal) { 1659 std::swap(RHS, LHS); 1660 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1661 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1662 } 1663 1664 if (VRegAndVal && VRegAndVal->Value == 0) { 1665 // If there's a G_AND feeding into this branch, try to fold it away by 1666 // emitting a TB(N)Z instead. 1667 // 1668 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1669 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1670 // would be redundant. 1671 if (AndInst && 1672 tryOptAndIntoCompareBranch( 1673 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1674 I.eraseFromParent(); 1675 return true; 1676 } 1677 1678 // Otherwise, try to emit a CB(N)Z instead. 1679 auto LHSTy = MRI.getType(LHS); 1680 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1681 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1682 I.eraseFromParent(); 1683 return true; 1684 } 1685 } 1686 } 1687 1688 return false; 1689 } 1690 1691 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1692 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1693 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1694 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1695 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1696 return true; 1697 1698 // Couldn't optimize. Emit a compare + a Bcc. 
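  //
  // E.g. (illustrative only; the exact opcodes depend on the operand types):
  //
  //   %c:gpr(s32) = G_ICMP intpred(slt), %a:gpr(s32), %b:gpr(s32)
  //   G_BRCOND %c, %bb.2
  //
  // falls back to a flag-setting SUBSWrr of %a and %b followed by a Bcc with
  // the LT condition targeting %bb.2.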
1699 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1700 auto PredOp = ICmp.getOperand(1); 1701 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1702 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1703 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1704 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1705 I.eraseFromParent(); 1706 return true; 1707 } 1708 1709 bool AArch64InstructionSelector::selectCompareBranch( 1710 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1711 Register CondReg = I.getOperand(0).getReg(); 1712 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1713 // Try to select the G_BRCOND using whatever is feeding the condition if 1714 // possible. 1715 unsigned CCMIOpc = CCMI->getOpcode(); 1716 if (CCMIOpc == TargetOpcode::G_FCMP) 1717 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1718 if (CCMIOpc == TargetOpcode::G_ICMP) 1719 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1720 1721 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1722 // instructions will not be produced, as they are conditional branch 1723 // instructions that do not set flags. 1724 if (ProduceNonFlagSettingCondBr) { 1725 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1726 I.getOperand(1).getMBB(), MIB); 1727 I.eraseFromParent(); 1728 return true; 1729 } 1730 1731 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1732 auto TstMI = 1733 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1734 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1735 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1736 .addImm(AArch64CC::EQ) 1737 .addMBB(I.getOperand(1).getMBB()); 1738 I.eraseFromParent(); 1739 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1740 } 1741 1742 /// Returns the element immediate value of a vector shift operand if found. 1743 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1744 static Optional<int64_t> getVectorShiftImm(Register Reg, 1745 MachineRegisterInfo &MRI) { 1746 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1747 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1748 return getAArch64VectorSplatScalar(*OpMI, MRI); 1749 } 1750 1751 /// Matches and returns the shift immediate value for a SHL instruction given 1752 /// a shift operand. 1753 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { 1754 Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1755 if (!ShiftImm) 1756 return None; 1757 // Check the immediate is in range for a SHL. 
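  // For example (illustrative), with a <4 x s32> shift only amounts 0..31 can
  // be encoded in the immediate form (SHLv4i32_shift); larger or negative
  // amounts are rejected here and the register form is used instead.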
1758 int64_t Imm = *ShiftImm; 1759 if (Imm < 0) 1760 return None; 1761 switch (SrcTy.getElementType().getSizeInBits()) { 1762 default: 1763 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1764 return None; 1765 case 8: 1766 if (Imm > 7) 1767 return None; 1768 break; 1769 case 16: 1770 if (Imm > 15) 1771 return None; 1772 break; 1773 case 32: 1774 if (Imm > 31) 1775 return None; 1776 break; 1777 case 64: 1778 if (Imm > 63) 1779 return None; 1780 break; 1781 } 1782 return Imm; 1783 } 1784 1785 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1786 MachineRegisterInfo &MRI) { 1787 assert(I.getOpcode() == TargetOpcode::G_SHL); 1788 Register DstReg = I.getOperand(0).getReg(); 1789 const LLT Ty = MRI.getType(DstReg); 1790 Register Src1Reg = I.getOperand(1).getReg(); 1791 Register Src2Reg = I.getOperand(2).getReg(); 1792 1793 if (!Ty.isVector()) 1794 return false; 1795 1796 // Check if we have a vector of constants on RHS that we can select as the 1797 // immediate form. 1798 Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1799 1800 unsigned Opc = 0; 1801 if (Ty == LLT::fixed_vector(2, 64)) { 1802 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1803 } else if (Ty == LLT::fixed_vector(4, 32)) { 1804 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1805 } else if (Ty == LLT::fixed_vector(2, 32)) { 1806 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1807 } else if (Ty == LLT::fixed_vector(4, 16)) { 1808 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1809 } else if (Ty == LLT::fixed_vector(8, 16)) { 1810 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1811 } else if (Ty == LLT::fixed_vector(16, 8)) { 1812 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1813 } else if (Ty == LLT::fixed_vector(8, 8)) { 1814 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1815 } else { 1816 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1817 return false; 1818 } 1819 1820 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1821 if (ImmVal) 1822 Shl.addImm(*ImmVal); 1823 else 1824 Shl.addUse(Src2Reg); 1825 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1826 I.eraseFromParent(); 1827 return true; 1828 } 1829 1830 bool AArch64InstructionSelector::selectVectorAshrLshr( 1831 MachineInstr &I, MachineRegisterInfo &MRI) { 1832 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1833 I.getOpcode() == TargetOpcode::G_LSHR); 1834 Register DstReg = I.getOperand(0).getReg(); 1835 const LLT Ty = MRI.getType(DstReg); 1836 Register Src1Reg = I.getOperand(1).getReg(); 1837 Register Src2Reg = I.getOperand(2).getReg(); 1838 1839 if (!Ty.isVector()) 1840 return false; 1841 1842 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1843 1844 // We expect the immediate case to be lowered in the PostLegalCombiner to 1845 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1846 1847 // There is not a shift right register instruction, but the shift left 1848 // register instruction takes a signed value, where negative numbers specify a 1849 // right shift. 1850 1851 unsigned Opc = 0; 1852 unsigned NegOpc = 0; 1853 const TargetRegisterClass *RC = 1854 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); 1855 if (Ty == LLT::fixed_vector(2, 64)) { 1856 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1857 NegOpc = AArch64::NEGv2i64; 1858 } else if (Ty == LLT::fixed_vector(4, 32)) { 1859 Opc = IsASHR ? 
AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1860 NegOpc = AArch64::NEGv4i32; 1861 } else if (Ty == LLT::fixed_vector(2, 32)) { 1862 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1863 NegOpc = AArch64::NEGv2i32; 1864 } else if (Ty == LLT::fixed_vector(4, 16)) { 1865 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1866 NegOpc = AArch64::NEGv4i16; 1867 } else if (Ty == LLT::fixed_vector(8, 16)) { 1868 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1869 NegOpc = AArch64::NEGv8i16; 1870 } else if (Ty == LLT::fixed_vector(16, 8)) { 1871 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1872 NegOpc = AArch64::NEGv16i8; 1873 } else if (Ty == LLT::fixed_vector(8, 8)) { 1874 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1875 NegOpc = AArch64::NEGv8i8; 1876 } else { 1877 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1878 return false; 1879 } 1880 1881 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1882 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1883 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1884 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1885 I.eraseFromParent(); 1886 return true; 1887 } 1888 1889 bool AArch64InstructionSelector::selectVaStartAAPCS( 1890 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1891 return false; 1892 } 1893 1894 bool AArch64InstructionSelector::selectVaStartDarwin( 1895 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1896 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 1897 Register ListReg = I.getOperand(0).getReg(); 1898 1899 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1900 1901 auto MIB = 1902 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 1903 .addDef(ArgsAddrReg) 1904 .addFrameIndex(FuncInfo->getVarArgsStackIndex()) 1905 .addImm(0) 1906 .addImm(0); 1907 1908 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1909 1910 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 1911 .addUse(ArgsAddrReg) 1912 .addUse(ListReg) 1913 .addImm(0) 1914 .addMemOperand(*I.memoperands_begin()); 1915 1916 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1917 I.eraseFromParent(); 1918 return true; 1919 } 1920 1921 void AArch64InstructionSelector::materializeLargeCMVal( 1922 MachineInstr &I, const Value *V, unsigned OpFlags) { 1923 MachineBasicBlock &MBB = *I.getParent(); 1924 MachineFunction &MF = *MBB.getParent(); 1925 MachineRegisterInfo &MRI = MF.getRegInfo(); 1926 1927 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 1928 MovZ->addOperand(MF, I.getOperand(1)); 1929 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 1930 AArch64II::MO_NC); 1931 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 1932 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 1933 1934 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 1935 Register ForceDstReg) { 1936 Register DstReg = ForceDstReg 1937 ? 
ForceDstReg 1938 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1939 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 1940 if (auto *GV = dyn_cast<GlobalValue>(V)) { 1941 MovI->addOperand(MF, MachineOperand::CreateGA( 1942 GV, MovZ->getOperand(1).getOffset(), Flags)); 1943 } else { 1944 MovI->addOperand( 1945 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 1946 MovZ->getOperand(1).getOffset(), Flags)); 1947 } 1948 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 1949 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 1950 return DstReg; 1951 }; 1952 Register DstReg = BuildMovK(MovZ.getReg(0), 1953 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 1954 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 1955 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 1956 } 1957 1958 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 1959 MachineBasicBlock &MBB = *I.getParent(); 1960 MachineFunction &MF = *MBB.getParent(); 1961 MachineRegisterInfo &MRI = MF.getRegInfo(); 1962 1963 switch (I.getOpcode()) { 1964 case TargetOpcode::G_STORE: { 1965 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 1966 MachineOperand &SrcOp = I.getOperand(0); 1967 if (MRI.getType(SrcOp.getReg()).isPointer()) { 1968 // Allow matching with imported patterns for stores of pointers. Unlike 1969 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 1970 // and constrain. 1971 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 1972 Register NewSrc = Copy.getReg(0); 1973 SrcOp.setReg(NewSrc); 1974 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 1975 Changed = true; 1976 } 1977 return Changed; 1978 } 1979 case TargetOpcode::G_PTR_ADD: 1980 return convertPtrAddToAdd(I, MRI); 1981 case TargetOpcode::G_LOAD: { 1982 // For scalar loads of pointers, we try to convert the dest type from p0 1983 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD 1984 // conversion, this should be ok because all users should have been 1985 // selected already, so the type doesn't matter for them. 1986 Register DstReg = I.getOperand(0).getReg(); 1987 const LLT DstTy = MRI.getType(DstReg); 1988 if (!DstTy.isPointer()) 1989 return false; 1990 MRI.setType(DstReg, LLT::scalar(64)); 1991 return true; 1992 } 1993 case AArch64::G_DUP: { 1994 // Convert the type from p0 to s64 to help selection. 1995 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 1996 if (!DstTy.getElementType().isPointer()) 1997 return false; 1998 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 1999 MRI.setType(I.getOperand(0).getReg(), 2000 DstTy.changeElementType(LLT::scalar(64))); 2001 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2002 I.getOperand(1).setReg(NewSrc.getReg(0)); 2003 return true; 2004 } 2005 case TargetOpcode::G_UITOFP: 2006 case TargetOpcode::G_SITOFP: { 2007 // If both source and destination regbanks are FPR, then convert the opcode 2008 // to G_SITOF so that the importer can select it to an fpr variant. 2009 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 2010 // copy. 
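    //
    // Illustrative sketch (hypothetical vregs): for
    //
    //   %dst:fpr(s32) = G_SITOFP %src:fpr(s32)
    //
    // rewriting the opcode to AArch64::G_SITOF lets the imported patterns
    // pick an SCVTF variant that reads its integer input from an FPR, rather
    // than first copying %src across to a GPR.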
2011 Register SrcReg = I.getOperand(1).getReg(); 2012 LLT SrcTy = MRI.getType(SrcReg); 2013 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2014 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2015 return false; 2016 2017 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2018 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2019 I.setDesc(TII.get(AArch64::G_SITOF)); 2020 else 2021 I.setDesc(TII.get(AArch64::G_UITOF)); 2022 return true; 2023 } 2024 return false; 2025 } 2026 default: 2027 return false; 2028 } 2029 } 2030 2031 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2032 /// them to a standard G_ADD with a COPY on the source. 2033 /// 2034 /// The motivation behind this is to expose the add semantics to the imported 2035 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2036 /// because the selector works bottom up, uses before defs. By the time we 2037 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2038 /// fold this into addressing modes and were therefore unsuccessful. 2039 bool AArch64InstructionSelector::convertPtrAddToAdd( 2040 MachineInstr &I, MachineRegisterInfo &MRI) { 2041 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2042 Register DstReg = I.getOperand(0).getReg(); 2043 Register AddOp1Reg = I.getOperand(1).getReg(); 2044 const LLT PtrTy = MRI.getType(DstReg); 2045 if (PtrTy.getAddressSpace() != 0) 2046 return false; 2047 2048 const LLT CastPtrTy = 2049 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2050 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2051 // Set regbanks on the registers. 2052 if (PtrTy.isVector()) 2053 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2054 else 2055 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2056 2057 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2058 // %dst(intty) = G_ADD %intbase, off 2059 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2060 MRI.setType(DstReg, CastPtrTy); 2061 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2062 if (!select(*PtrToInt)) { 2063 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2064 return false; 2065 } 2066 2067 // Also take the opportunity here to try to do some optimization. 2068 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 2069 Register NegatedReg; 2070 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2071 return true; 2072 I.getOperand(2).setReg(NegatedReg); 2073 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2074 return true; 2075 } 2076 2077 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2078 MachineRegisterInfo &MRI) { 2079 // We try to match the immediate variant of LSL, which is actually an alias 2080 // for a special case of UBFM. Otherwise, we fall back to the imported 2081 // selector which will match the register variant. 2082 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2083 const auto &MO = I.getOperand(2); 2084 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2085 if (!VRegAndVal) 2086 return false; 2087 2088 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2089 if (DstTy.isVector()) 2090 return false; 2091 bool Is64Bit = DstTy.getSizeInBits() == 64; 2092 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2093 auto Imm2Fn = Is64Bit ? 
selectShiftB_64(MO) : selectShiftB_32(MO); 2094 2095 if (!Imm1Fn || !Imm2Fn) 2096 return false; 2097 2098 auto NewI = 2099 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2100 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2101 2102 for (auto &RenderFn : *Imm1Fn) 2103 RenderFn(NewI); 2104 for (auto &RenderFn : *Imm2Fn) 2105 RenderFn(NewI); 2106 2107 I.eraseFromParent(); 2108 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2109 } 2110 2111 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2112 MachineInstr &I, MachineRegisterInfo &MRI) { 2113 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2114 // If we're storing a scalar, it doesn't matter what register bank that 2115 // scalar is on. All that matters is the size. 2116 // 2117 // So, if we see something like this (with a 32-bit scalar as an example): 2118 // 2119 // %x:gpr(s32) = ... something ... 2120 // %y:fpr(s32) = COPY %x:gpr(s32) 2121 // G_STORE %y:fpr(s32) 2122 // 2123 // We can fix this up into something like this: 2124 // 2125 // G_STORE %x:gpr(s32) 2126 // 2127 // And then continue the selection process normally. 2128 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2129 if (!DefDstReg.isValid()) 2130 return false; 2131 LLT DefDstTy = MRI.getType(DefDstReg); 2132 Register StoreSrcReg = I.getOperand(0).getReg(); 2133 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2134 2135 // If we get something strange like a physical register, then we shouldn't 2136 // go any further. 2137 if (!DefDstTy.isValid()) 2138 return false; 2139 2140 // Are the source and dst types the same size? 2141 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2142 return false; 2143 2144 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2145 RBI.getRegBank(DefDstReg, MRI, TRI)) 2146 return false; 2147 2148 // We have a cross-bank copy, which is entering a store. Let's fold it. 2149 I.getOperand(0).setReg(DefDstReg); 2150 return true; 2151 } 2152 2153 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2154 assert(I.getParent() && "Instruction should be in a basic block!"); 2155 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2156 2157 MachineBasicBlock &MBB = *I.getParent(); 2158 MachineFunction &MF = *MBB.getParent(); 2159 MachineRegisterInfo &MRI = MF.getRegInfo(); 2160 2161 switch (I.getOpcode()) { 2162 case AArch64::G_DUP: { 2163 // Before selecting a DUP instruction, check if it is better selected as a 2164 // MOV or load from a constant pool. 2165 Register Src = I.getOperand(1).getReg(); 2166 auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI); 2167 if (!ValAndVReg) 2168 return false; 2169 LLVMContext &Ctx = MF.getFunction().getContext(); 2170 Register Dst = I.getOperand(0).getReg(); 2171 auto *CV = ConstantDataVector::getSplat( 2172 MRI.getType(Dst).getNumElements(), 2173 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), 2174 ValAndVReg->Value)); 2175 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2176 return false; 2177 I.eraseFromParent(); 2178 return true; 2179 } 2180 case TargetOpcode::G_SEXT: 2181 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2182 // over a normal extend. 
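    // Illustrative sketch (hypothetical MIR, not from a real test):
    //
    //   %el:gpr(s32) = G_EXTRACT_VECTOR_ELT %v:fpr(<4 x s32>), %idx(s64)
    //   %ext:gpr(s64) = G_SEXT %el(s32)
    //
    // can be selected as a single SMOV (e.g. SMOVvi32to64) instead of a lane
    // copy followed by a separate SBFM sign extend.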
2183 if (selectUSMovFromExtend(I, MRI)) 2184 return true; 2185 return false; 2186 case TargetOpcode::G_BR: 2187 return false; 2188 case TargetOpcode::G_SHL: 2189 return earlySelectSHL(I, MRI); 2190 case TargetOpcode::G_CONSTANT: { 2191 bool IsZero = false; 2192 if (I.getOperand(1).isCImm()) 2193 IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; 2194 else if (I.getOperand(1).isImm()) 2195 IsZero = I.getOperand(1).getImm() == 0; 2196 2197 if (!IsZero) 2198 return false; 2199 2200 Register DefReg = I.getOperand(0).getReg(); 2201 LLT Ty = MRI.getType(DefReg); 2202 if (Ty.getSizeInBits() == 64) { 2203 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2204 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2205 } else if (Ty.getSizeInBits() == 32) { 2206 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2207 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2208 } else 2209 return false; 2210 2211 I.setDesc(TII.get(TargetOpcode::COPY)); 2212 return true; 2213 } 2214 2215 case TargetOpcode::G_ADD: { 2216 // Check if this is being fed by a G_ICMP on either side. 2217 // 2218 // (cmp pred, x, y) + z 2219 // 2220 // In the above case, when the cmp is true, we increment z by 1. So, we can 2221 // fold the add into the cset for the cmp by using cinc. 2222 // 2223 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2224 Register AddDst = I.getOperand(0).getReg(); 2225 Register AddLHS = I.getOperand(1).getReg(); 2226 Register AddRHS = I.getOperand(2).getReg(); 2227 // Only handle scalars. 2228 LLT Ty = MRI.getType(AddLHS); 2229 if (Ty.isVector()) 2230 return false; 2231 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2232 // bits. 2233 unsigned Size = Ty.getSizeInBits(); 2234 if (Size != 32 && Size != 64) 2235 return false; 2236 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2237 if (!MRI.hasOneNonDBGUse(Reg)) 2238 return nullptr; 2239 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2240 // compare. 2241 if (Size == 32) 2242 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2243 // We model scalar compares using 32-bit destinations right now. 2244 // If it's a 64-bit compare, it'll have 64-bit sources. 
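      //
      // I.e. the shape being matched below is (hypothetical vregs):
      //
      //   %cmp:gpr(s32) = G_ICMP intpred(eq), %x:gpr(s64), %y:gpr(s64)
      //   %wide:gpr(s64) = G_ZEXT %cmp(s32)
      //   %sum:gpr(s64) = G_ADD %z:gpr(s64), %wide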
2245 Register ZExt; 2246 if (!mi_match(Reg, MRI, 2247 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2248 return nullptr; 2249 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2250 if (!Cmp || 2251 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2252 return nullptr; 2253 return Cmp; 2254 }; 2255 // Try to match 2256 // z + (cmp pred, x, y) 2257 MachineInstr *Cmp = MatchCmp(AddRHS); 2258 if (!Cmp) { 2259 // (cmp pred, x, y) + z 2260 std::swap(AddLHS, AddRHS); 2261 Cmp = MatchCmp(AddRHS); 2262 if (!Cmp) 2263 return false; 2264 } 2265 auto &PredOp = Cmp->getOperand(1); 2266 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2267 const AArch64CC::CondCode InvCC = 2268 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2269 MIB.setInstrAndDebugLoc(I); 2270 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2271 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2272 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2273 I.eraseFromParent(); 2274 return true; 2275 } 2276 case TargetOpcode::G_OR: { 2277 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2278 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2279 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2280 Register Dst = I.getOperand(0).getReg(); 2281 LLT Ty = MRI.getType(Dst); 2282 2283 if (!Ty.isScalar()) 2284 return false; 2285 2286 unsigned Size = Ty.getSizeInBits(); 2287 if (Size != 32 && Size != 64) 2288 return false; 2289 2290 Register ShiftSrc; 2291 int64_t ShiftImm; 2292 Register MaskSrc; 2293 int64_t MaskImm; 2294 if (!mi_match( 2295 Dst, MRI, 2296 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2297 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2298 return false; 2299 2300 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2301 return false; 2302 2303 int64_t Immr = Size - ShiftImm; 2304 int64_t Imms = Size - ShiftImm - 1; 2305 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2306 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2307 I.eraseFromParent(); 2308 return true; 2309 } 2310 case TargetOpcode::G_FENCE: { 2311 if (I.getOperand(1).getImm() == 0) 2312 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CompilerBarrier)) 2313 .addImm(I.getOperand(0).getImm()); 2314 else 2315 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::DMB)) 2316 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); 2317 I.eraseFromParent(); 2318 return true; 2319 } 2320 default: 2321 return false; 2322 } 2323 } 2324 2325 bool AArch64InstructionSelector::select(MachineInstr &I) { 2326 assert(I.getParent() && "Instruction should be in a basic block!"); 2327 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2328 2329 MachineBasicBlock &MBB = *I.getParent(); 2330 MachineFunction &MF = *MBB.getParent(); 2331 MachineRegisterInfo &MRI = MF.getRegInfo(); 2332 2333 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); 2334 if (Subtarget->requiresStrictAlign()) { 2335 // We don't support this feature yet. 2336 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2337 return false; 2338 } 2339 2340 MIB.setInstrAndDebugLoc(I); 2341 2342 unsigned Opcode = I.getOpcode(); 2343 // G_PHI requires same handling as PHI 2344 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2345 // Certain non-generic instructions also need some special handling. 
2346 2347 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2348 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2349 2350 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2351 const Register DefReg = I.getOperand(0).getReg(); 2352 const LLT DefTy = MRI.getType(DefReg); 2353 2354 const RegClassOrRegBank &RegClassOrBank = 2355 MRI.getRegClassOrRegBank(DefReg); 2356 2357 const TargetRegisterClass *DefRC 2358 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 2359 if (!DefRC) { 2360 if (!DefTy.isValid()) { 2361 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2362 return false; 2363 } 2364 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 2365 DefRC = getRegClassForTypeOnBank(DefTy, RB); 2366 if (!DefRC) { 2367 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2368 return false; 2369 } 2370 } 2371 2372 I.setDesc(TII.get(TargetOpcode::PHI)); 2373 2374 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2375 } 2376 2377 if (I.isCopy()) 2378 return selectCopy(I, TII, MRI, TRI, RBI); 2379 2380 return true; 2381 } 2382 2383 2384 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2385 LLVM_DEBUG( 2386 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2387 return false; 2388 } 2389 2390 // Try to do some lowering before we start instruction selecting. These 2391 // lowerings are purely transformations on the input G_MIR and so selection 2392 // must continue after any modification of the instruction. 2393 if (preISelLower(I)) { 2394 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2395 } 2396 2397 // There may be patterns where the importer can't deal with them optimally, 2398 // but does select it to a suboptimal sequence so our custom C++ selection 2399 // code later never has a chance to work on it. Therefore, we have an early 2400 // selection attempt here to give priority to certain selection routines 2401 // over the imported ones. 2402 if (earlySelect(I)) 2403 return true; 2404 2405 if (selectImpl(I, *CoverageInfo)) 2406 return true; 2407 2408 LLT Ty = 2409 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; 2410 2411 switch (Opcode) { 2412 case TargetOpcode::G_SBFX: 2413 case TargetOpcode::G_UBFX: { 2414 static const unsigned OpcTable[2][2] = { 2415 {AArch64::UBFMWri, AArch64::UBFMXri}, 2416 {AArch64::SBFMWri, AArch64::SBFMXri}}; 2417 bool IsSigned = Opcode == TargetOpcode::G_SBFX; 2418 unsigned Size = Ty.getSizeInBits(); 2419 unsigned Opc = OpcTable[IsSigned][Size == 64]; 2420 auto Cst1 = 2421 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); 2422 assert(Cst1 && "Should have gotten a constant for src 1?"); 2423 auto Cst2 = 2424 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); 2425 assert(Cst2 && "Should have gotten a constant for src 2?"); 2426 auto LSB = Cst1->Value.getZExtValue(); 2427 auto Width = Cst2->Value.getZExtValue(); 2428 auto BitfieldInst = 2429 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) 2430 .addImm(LSB) 2431 .addImm(LSB + Width - 1); 2432 I.eraseFromParent(); 2433 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); 2434 } 2435 case TargetOpcode::G_BRCOND: 2436 return selectCompareBranch(I, MF, MRI); 2437 2438 case TargetOpcode::G_BRINDIRECT: { 2439 I.setDesc(TII.get(AArch64::BR)); 2440 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2441 } 2442 2443 case TargetOpcode::G_BRJT: 2444 return selectBrJT(I, MRI); 2445 2446 case AArch64::G_ADD_LOW: { 2447 // This op may have been separated from it's ADRP companion by the localizer 2448 // or some other code motion pass. Given that many CPUs will try to 2449 // macro fuse these operations anyway, select this into a MOVaddr pseudo 2450 // which will later be expanded into an ADRP+ADD pair after scheduling. 2451 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2452 if (BaseMI->getOpcode() != AArch64::ADRP) { 2453 I.setDesc(TII.get(AArch64::ADDXri)); 2454 I.addOperand(MachineOperand::CreateImm(0)); 2455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2456 } 2457 assert(TM.getCodeModel() == CodeModel::Small && 2458 "Expected small code model"); 2459 auto Op1 = BaseMI->getOperand(1); 2460 auto Op2 = I.getOperand(2); 2461 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2462 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2463 Op1.getTargetFlags()) 2464 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2465 Op2.getTargetFlags()); 2466 I.eraseFromParent(); 2467 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2468 } 2469 2470 case TargetOpcode::G_BSWAP: { 2471 // Handle vector types for G_BSWAP directly. 2472 Register DstReg = I.getOperand(0).getReg(); 2473 LLT DstTy = MRI.getType(DstReg); 2474 2475 // We should only get vector types here; everything else is handled by the 2476 // importer right now. 2477 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { 2478 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); 2479 return false; 2480 } 2481 2482 // Only handle 4 and 2 element vectors for now. 2483 // TODO: 16-bit elements. 2484 unsigned NumElts = DstTy.getNumElements(); 2485 if (NumElts != 4 && NumElts != 2) { 2486 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); 2487 return false; 2488 } 2489 2490 // Choose the correct opcode for the supported types. Right now, that's 2491 // v2s32, v4s32, and v2s64. 2492 unsigned Opc = 0; 2493 unsigned EltSize = DstTy.getElementType().getSizeInBits(); 2494 if (EltSize == 32) 2495 Opc = (DstTy.getNumElements() == 2) ? 
AArch64::REV32v8i8 2496 : AArch64::REV32v16i8; 2497 else if (EltSize == 64) 2498 Opc = AArch64::REV64v16i8; 2499 2500 // We should always get something by the time we get here... 2501 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); 2502 2503 I.setDesc(TII.get(Opc)); 2504 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2505 } 2506 2507 case TargetOpcode::G_FCONSTANT: 2508 case TargetOpcode::G_CONSTANT: { 2509 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2510 2511 const LLT s8 = LLT::scalar(8); 2512 const LLT s16 = LLT::scalar(16); 2513 const LLT s32 = LLT::scalar(32); 2514 const LLT s64 = LLT::scalar(64); 2515 const LLT s128 = LLT::scalar(128); 2516 const LLT p0 = LLT::pointer(0, 64); 2517 2518 const Register DefReg = I.getOperand(0).getReg(); 2519 const LLT DefTy = MRI.getType(DefReg); 2520 const unsigned DefSize = DefTy.getSizeInBits(); 2521 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2522 2523 // FIXME: Redundant check, but even less readable when factored out. 2524 if (isFP) { 2525 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2526 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2527 << " constant, expected: " << s16 << " or " << s32 2528 << " or " << s64 << " or " << s128 << '\n'); 2529 return false; 2530 } 2531 2532 if (RB.getID() != AArch64::FPRRegBankID) { 2533 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2534 << " constant on bank: " << RB 2535 << ", expected: FPR\n"); 2536 return false; 2537 } 2538 2539 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2540 // can be sure tablegen works correctly and isn't rescued by this code. 2541 // 0.0 is not covered by tablegen for FP128. So we will handle this 2542 // scenario in the code here. 2543 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2544 return false; 2545 } else { 2546 // s32 and s64 are covered by tablegen. 2547 if (Ty != p0 && Ty != s8 && Ty != s16) { 2548 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2549 << " constant, expected: " << s32 << ", " << s64 2550 << ", or " << p0 << '\n'); 2551 return false; 2552 } 2553 2554 if (RB.getID() != AArch64::GPRRegBankID) { 2555 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2556 << " constant on bank: " << RB 2557 << ", expected: GPR\n"); 2558 return false; 2559 } 2560 } 2561 2562 if (isFP) { 2563 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); 2564 // For 16, 64, and 128b values, emit a constant pool load. 2565 switch (DefSize) { 2566 default: 2567 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2568 case 32: 2569 // For s32, use a cp load if we have optsize/minsize. 2570 if (!shouldOptForSize(&MF)) 2571 break; 2572 LLVM_FALLTHROUGH; 2573 case 16: 2574 case 64: 2575 case 128: { 2576 auto *FPImm = I.getOperand(1).getFPImm(); 2577 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2578 if (!LoadMI) { 2579 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2580 return false; 2581 } 2582 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2583 I.eraseFromParent(); 2584 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2585 } 2586 } 2587 2588 // Either emit a FMOV, or emit a copy to emit a normal mov. 
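      //
      // Illustrative sketch for the remaining s32 case (hypothetical value):
      // a G_FCONSTANT of 0.1f, which has no FMOV immediate encoding, becomes
      // a MOVi32imm of its IEEE-754 bit pattern into a fresh GPR32, followed
      // by a COPY of that GPR32 into the original FPR def.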
2589 assert(DefSize == 32 && 2590 "Expected constant pool loads for all sizes other than 32!"); 2591 const Register DefGPRReg = 2592 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2593 MachineOperand &RegOp = I.getOperand(0); 2594 RegOp.setReg(DefGPRReg); 2595 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2596 MIB.buildCopy({DefReg}, {DefGPRReg}); 2597 2598 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2599 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2600 return false; 2601 } 2602 2603 MachineOperand &ImmOp = I.getOperand(1); 2604 // FIXME: Is going through int64_t always correct? 2605 ImmOp.ChangeToImmediate( 2606 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2607 } else if (I.getOperand(1).isCImm()) { 2608 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2609 I.getOperand(1).ChangeToImmediate(Val); 2610 } else if (I.getOperand(1).isImm()) { 2611 uint64_t Val = I.getOperand(1).getImm(); 2612 I.getOperand(1).ChangeToImmediate(Val); 2613 } 2614 2615 const unsigned MovOpc = 2616 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2617 I.setDesc(TII.get(MovOpc)); 2618 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2619 return true; 2620 } 2621 case TargetOpcode::G_EXTRACT: { 2622 Register DstReg = I.getOperand(0).getReg(); 2623 Register SrcReg = I.getOperand(1).getReg(); 2624 LLT SrcTy = MRI.getType(SrcReg); 2625 LLT DstTy = MRI.getType(DstReg); 2626 (void)DstTy; 2627 unsigned SrcSize = SrcTy.getSizeInBits(); 2628 2629 if (SrcTy.getSizeInBits() > 64) { 2630 // This should be an extract of an s128, which is like a vector extract. 2631 if (SrcTy.getSizeInBits() != 128) 2632 return false; 2633 // Only support extracting 64 bits from an s128 at the moment. 2634 if (DstTy.getSizeInBits() != 64) 2635 return false; 2636 2637 unsigned Offset = I.getOperand(2).getImm(); 2638 if (Offset % 64 != 0) 2639 return false; 2640 2641 // Check we have the right regbank always. 2642 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2643 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2644 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2645 2646 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2647 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2648 .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2649 I.eraseFromParent(); 2650 return true; 2651 } 2652 2653 // Emit the same code as a vector extract. 2654 // Offset must be a multiple of 64. 2655 unsigned LaneIdx = Offset / 64; 2656 MachineInstr *Extract = emitExtractVectorElt( 2657 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2658 if (!Extract) 2659 return false; 2660 I.eraseFromParent(); 2661 return true; 2662 } 2663 2664 I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); 2665 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2666 Ty.getSizeInBits() - 1); 2667 2668 if (SrcSize < 64) { 2669 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2670 "unexpected G_EXTRACT types"); 2671 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2672 } 2673 2674 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2675 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2676 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2677 .addReg(DstReg, 0, AArch64::sub_32); 2678 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2679 AArch64::GPR32RegClass, MRI); 2680 I.getOperand(0).setReg(DstReg); 2681 2682 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2683 } 2684 2685 case TargetOpcode::G_INSERT: { 2686 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2687 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2688 unsigned DstSize = DstTy.getSizeInBits(); 2689 // Larger inserts are vectors, same-size ones should be something else by 2690 // now (split up or turned into COPYs). 2691 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2692 return false; 2693 2694 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2695 unsigned LSB = I.getOperand(3).getImm(); 2696 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2697 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2698 MachineInstrBuilder(MF, I).addImm(Width - 1); 2699 2700 if (DstSize < 64) { 2701 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2702 "unexpected G_INSERT types"); 2703 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2704 } 2705 2706 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2707 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2708 TII.get(AArch64::SUBREG_TO_REG)) 2709 .addDef(SrcReg) 2710 .addImm(0) 2711 .addUse(I.getOperand(2).getReg()) 2712 .addImm(AArch64::sub_32); 2713 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2714 AArch64::GPR32RegClass, MRI); 2715 I.getOperand(2).setReg(SrcReg); 2716 2717 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2718 } 2719 case TargetOpcode::G_FRAME_INDEX: { 2720 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2721 if (Ty != LLT::pointer(0, 64)) { 2722 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2723 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2724 return false; 2725 } 2726 I.setDesc(TII.get(AArch64::ADDXri)); 2727 2728 // MOs for a #0 shifted immediate. 2729 I.addOperand(MachineOperand::CreateImm(0)); 2730 I.addOperand(MachineOperand::CreateImm(0)); 2731 2732 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2733 } 2734 2735 case TargetOpcode::G_GLOBAL_VALUE: { 2736 auto GV = I.getOperand(1).getGlobal(); 2737 if (GV->isThreadLocal()) 2738 return selectTLSGlobalValue(I, MRI); 2739 2740 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 2741 if (OpFlags & AArch64II::MO_GOT) { 2742 I.setDesc(TII.get(AArch64::LOADgot)); 2743 I.getOperand(1).setTargetFlags(OpFlags); 2744 } else if (TM.getCodeModel() == CodeModel::Large) { 2745 // Materialize the global using movz/movk instructions. 
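    //
    // Roughly (illustrative AArch64 asm for a global @g; the register is a
    // placeholder):
    //
    //   movz x0, #:abs_g0_nc:g
    //   movk x0, #:abs_g1_nc:g, lsl #16
    //   movk x0, #:abs_g2_nc:g, lsl #32
    //   movk x0, #:abs_g3:g, lsl #48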
2746 materializeLargeCMVal(I, GV, OpFlags); 2747 I.eraseFromParent(); 2748 return true; 2749 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2750 I.setDesc(TII.get(AArch64::ADR)); 2751 I.getOperand(1).setTargetFlags(OpFlags); 2752 } else { 2753 I.setDesc(TII.get(AArch64::MOVaddr)); 2754 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2755 MachineInstrBuilder MIB(MF, I); 2756 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2757 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2758 } 2759 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2760 } 2761 2762 case TargetOpcode::G_ZEXTLOAD: 2763 case TargetOpcode::G_LOAD: 2764 case TargetOpcode::G_STORE: { 2765 GLoadStore &LdSt = cast<GLoadStore>(I); 2766 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 2767 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 2768 2769 if (PtrTy != LLT::pointer(0, 64)) { 2770 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 2771 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2772 return false; 2773 } 2774 2775 uint64_t MemSizeInBytes = LdSt.getMemSize(); 2776 unsigned MemSizeInBits = LdSt.getMemSizeInBits(); 2777 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 2778 2779 // Need special instructions for atomics that affect ordering. 2780 if (Order != AtomicOrdering::NotAtomic && 2781 Order != AtomicOrdering::Unordered && 2782 Order != AtomicOrdering::Monotonic) { 2783 assert(!isa<GZExtLoad>(LdSt)); 2784 if (MemSizeInBytes > 64) 2785 return false; 2786 2787 if (isa<GLoad>(LdSt)) { 2788 static constexpr unsigned LDAPROpcodes[] = { 2789 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; 2790 static constexpr unsigned LDAROpcodes[] = { 2791 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; 2792 ArrayRef<unsigned> Opcodes = 2793 STI.hasLDAPR() && Order != AtomicOrdering::SequentiallyConsistent 2794 ? LDAPROpcodes 2795 : LDAROpcodes; 2796 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2797 } else { 2798 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 2799 AArch64::STLRW, AArch64::STLRX}; 2800 Register ValReg = LdSt.getReg(0); 2801 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 2802 // Emit a subreg copy of 32 bits. 2803 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2804 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 2805 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 2806 I.getOperand(0).setReg(NewVal); 2807 } 2808 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2809 } 2810 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2811 return true; 2812 } 2813 2814 #ifndef NDEBUG 2815 const Register PtrReg = LdSt.getPointerReg(); 2816 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 2817 // Check that the pointer register is valid. 2818 assert(PtrRB.getID() == AArch64::GPRRegBankID && 2819 "Load/Store pointer operand isn't a GPR"); 2820 assert(MRI.getType(PtrReg).isPointer() && 2821 "Load/Store pointer operand isn't a pointer"); 2822 #endif 2823 2824 const Register ValReg = LdSt.getReg(0); 2825 const LLT ValTy = MRI.getType(ValReg); 2826 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 2827 2828 // The code below doesn't support truncating stores, so we need to split it 2829 // again. 
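    //
    // Illustrative sketch (hypothetical vregs): a truncating store such as
    //
    //   G_STORE %val:gpr(s64), %ptr(p0) :: (store (s32))
    //
    // has its stored operand replaced by a sub_32 COPY of %val so that a
    // plain 32-bit store can be selected below.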
2830 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2831 unsigned SubReg; 2832 LLT MemTy = LdSt.getMMO().getMemoryType(); 2833 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2834 if (!getSubRegForClass(RC, TRI, SubReg)) 2835 return false; 2836 2837 // Generate a subreg copy. 2838 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 2839 .addReg(ValReg, 0, SubReg) 2840 .getReg(0); 2841 RBI.constrainGenericRegister(Copy, *RC, MRI); 2842 LdSt.getOperand(0).setReg(Copy); 2843 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2844 // If this is an any-extending load from the FPR bank, split it into a regular 2845 // load + extend. 2846 if (RB.getID() == AArch64::FPRRegBankID) { 2847 unsigned SubReg; 2848 LLT MemTy = LdSt.getMMO().getMemoryType(); 2849 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2850 if (!getSubRegForClass(RC, TRI, SubReg)) 2851 return false; 2852 Register OldDst = LdSt.getReg(0); 2853 Register NewDst = 2854 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 2855 LdSt.getOperand(0).setReg(NewDst); 2856 MRI.setRegBank(NewDst, RB); 2857 // Generate a SUBREG_TO_REG to extend it. 2858 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 2859 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 2860 .addImm(0) 2861 .addUse(NewDst) 2862 .addImm(SubReg); 2863 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); 2864 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 2865 MIB.setInstr(LdSt); 2866 } 2867 } 2868 2869 // Helper lambda for partially selecting I. Either returns the original 2870 // instruction with an updated opcode, or a new instruction. 2871 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 2872 bool IsStore = isa<GStore>(I); 2873 const unsigned NewOpc = 2874 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 2875 if (NewOpc == I.getOpcode()) 2876 return nullptr; 2877 // Check if we can fold anything into the addressing mode. 2878 auto AddrModeFns = 2879 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 2880 if (!AddrModeFns) { 2881 // Can't fold anything. Use the original instruction. 2882 I.setDesc(TII.get(NewOpc)); 2883 I.addOperand(MachineOperand::CreateImm(0)); 2884 return &I; 2885 } 2886 2887 // Folded something. Create a new instruction and return it. 2888 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 2889 Register CurValReg = I.getOperand(0).getReg(); 2890 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 2891 NewInst.cloneMemRefs(I); 2892 for (auto &Fn : *AddrModeFns) 2893 Fn(NewInst); 2894 I.eraseFromParent(); 2895 return &*NewInst; 2896 }; 2897 2898 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 2899 if (!LoadStore) 2900 return false; 2901 2902 // If we're storing a 0, use WZR/XZR. 2903 if (Opcode == TargetOpcode::G_STORE) { 2904 auto CVal = getIConstantVRegValWithLookThrough( 2905 LoadStore->getOperand(0).getReg(), MRI); 2906 if (CVal && CVal->Value == 0) { 2907 switch (LoadStore->getOpcode()) { 2908 case AArch64::STRWui: 2909 case AArch64::STRHHui: 2910 case AArch64::STRBBui: 2911 LoadStore->getOperand(0).setReg(AArch64::WZR); 2912 break; 2913 case AArch64::STRXui: 2914 LoadStore->getOperand(0).setReg(AArch64::XZR); 2915 break; 2916 } 2917 } 2918 } 2919 2920 if (IsZExtLoad) { 2921 // The zextload from a smaller type to i32 should be handled by the 2922 // importer. 
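      //
      // Illustrative sketch for the case handled here (hypothetical vregs):
      //
      //   %d:gpr(s64) = G_ZEXTLOAD %p(p0) :: (load (s32))
      //
      // is selected as an LDRWui into a fresh GPR32, followed by a
      // SUBREG_TO_REG that places the 32-bit result in the s64 def.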
2923 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 2924 return false; 2925 // If we have a ZEXTLOAD then change the load's type to be a narrower reg 2926 // and zero_extend with SUBREG_TO_REG. 2927 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2928 Register DstReg = LoadStore->getOperand(0).getReg(); 2929 LoadStore->getOperand(0).setReg(LdReg); 2930 2931 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 2932 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 2933 .addImm(0) 2934 .addUse(LdReg) 2935 .addImm(AArch64::sub_32); 2936 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2937 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 2938 MRI); 2939 } 2940 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2941 } 2942 2943 case TargetOpcode::G_SMULH: 2944 case TargetOpcode::G_UMULH: { 2945 // Reject the various things we don't support yet. 2946 if (unsupportedBinOp(I, RBI, MRI, TRI)) 2947 return false; 2948 2949 const Register DefReg = I.getOperand(0).getReg(); 2950 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2951 2952 if (RB.getID() != AArch64::GPRRegBankID) { 2953 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); 2954 return false; 2955 } 2956 2957 if (Ty != LLT::scalar(64)) { 2958 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty 2959 << ", expected: " << LLT::scalar(64) << '\n'); 2960 return false; 2961 } 2962 2963 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr 2964 : AArch64::UMULHrr; 2965 I.setDesc(TII.get(NewOpc)); 2966 2967 // Now that we selected an opcode, we need to constrain the register 2968 // operands to use appropriate classes. 2969 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2970 } 2971 case TargetOpcode::G_LSHR: 2972 case TargetOpcode::G_ASHR: 2973 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 2974 return selectVectorAshrLshr(I, MRI); 2975 LLVM_FALLTHROUGH; 2976 case TargetOpcode::G_SHL: 2977 if (Opcode == TargetOpcode::G_SHL && 2978 MRI.getType(I.getOperand(0).getReg()).isVector()) 2979 return selectVectorSHL(I, MRI); 2980 2981 // These shifts were legalized to have 64 bit shift amounts because we 2982 // want to take advantage of the selection patterns that assume the 2983 // immediates are s64s, however, selectBinaryOp will assume both operands 2984 // will have the same bit size. 2985 { 2986 Register SrcReg = I.getOperand(1).getReg(); 2987 Register ShiftReg = I.getOperand(2).getReg(); 2988 const LLT ShiftTy = MRI.getType(ShiftReg); 2989 const LLT SrcTy = MRI.getType(SrcReg); 2990 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 2991 ShiftTy.getSizeInBits() == 64) { 2992 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 2993 // Insert a subregister copy to implement a 64->32 trunc 2994 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 2995 .addReg(ShiftReg, 0, AArch64::sub_32); 2996 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2997 I.getOperand(2).setReg(Trunc.getReg(0)); 2998 } 2999 } 3000 LLVM_FALLTHROUGH; 3001 case TargetOpcode::G_OR: { 3002 // Reject the various things we don't support yet. 
3003 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3004 return false; 3005 3006 const unsigned OpSize = Ty.getSizeInBits(); 3007 3008 const Register DefReg = I.getOperand(0).getReg(); 3009 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3010 3011 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 3012 if (NewOpc == I.getOpcode()) 3013 return false; 3014 3015 I.setDesc(TII.get(NewOpc)); 3016 // FIXME: Should the type be always reset in setDesc? 3017 3018 // Now that we selected an opcode, we need to constrain the register 3019 // operands to use appropriate classes. 3020 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3021 } 3022 3023 case TargetOpcode::G_PTR_ADD: { 3024 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 3025 I.eraseFromParent(); 3026 return true; 3027 } 3028 case TargetOpcode::G_SADDO: 3029 case TargetOpcode::G_UADDO: 3030 case TargetOpcode::G_SSUBO: 3031 case TargetOpcode::G_USUBO: { 3032 // Emit the operation and get the correct condition code. 3033 auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(), 3034 I.getOperand(2), I.getOperand(3), MIB); 3035 3036 // Now, put the overflow result in the register given by the first operand 3037 // to the overflow op. CSINC increments the result when the predicate is 3038 // false, so to get the increment when it's true, we need to use the 3039 // inverse. In this case, we want to increment when carry is set. 3040 Register ZReg = AArch64::WZR; 3041 emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg, 3042 getInvertedCondCode(OpAndCC.second), MIB); 3043 I.eraseFromParent(); 3044 return true; 3045 } 3046 3047 case TargetOpcode::G_PTRMASK: { 3048 Register MaskReg = I.getOperand(2).getReg(); 3049 Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3050 // TODO: Implement arbitrary cases 3051 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3052 return false; 3053 3054 uint64_t Mask = *MaskVal; 3055 I.setDesc(TII.get(AArch64::ANDXri)); 3056 I.getOperand(2).ChangeToImmediate( 3057 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3058 3059 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3060 } 3061 case TargetOpcode::G_PTRTOINT: 3062 case TargetOpcode::G_TRUNC: { 3063 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3064 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3065 3066 const Register DstReg = I.getOperand(0).getReg(); 3067 const Register SrcReg = I.getOperand(1).getReg(); 3068 3069 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3070 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3071 3072 if (DstRB.getID() != SrcRB.getID()) { 3073 LLVM_DEBUG( 3074 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3075 return false; 3076 } 3077 3078 if (DstRB.getID() == AArch64::GPRRegBankID) { 3079 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3080 if (!DstRC) 3081 return false; 3082 3083 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); 3084 if (!SrcRC) 3085 return false; 3086 3087 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3088 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3089 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); 3090 return false; 3091 } 3092 3093 if (DstRC == SrcRC) { 3094 // Nothing to be done 3095 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3096 SrcTy == LLT::scalar(64)) { 3097 llvm_unreachable("TableGen can import this case"); 
3098 return false; 3099 } else if (DstRC == &AArch64::GPR32RegClass && 3100 SrcRC == &AArch64::GPR64RegClass) { 3101 I.getOperand(1).setSubReg(AArch64::sub_32); 3102 } else { 3103 LLVM_DEBUG( 3104 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3105 return false; 3106 } 3107 3108 I.setDesc(TII.get(TargetOpcode::COPY)); 3109 return true; 3110 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3111 if (DstTy == LLT::fixed_vector(4, 16) && 3112 SrcTy == LLT::fixed_vector(4, 32)) { 3113 I.setDesc(TII.get(AArch64::XTNv4i16)); 3114 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3115 return true; 3116 } 3117 3118 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3119 MachineInstr *Extract = emitExtractVectorElt( 3120 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3121 if (!Extract) 3122 return false; 3123 I.eraseFromParent(); 3124 return true; 3125 } 3126 3127 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 3128 if (Opcode == TargetOpcode::G_PTRTOINT) { 3129 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3130 I.setDesc(TII.get(TargetOpcode::COPY)); 3131 return selectCopy(I, TII, MRI, TRI, RBI); 3132 } 3133 } 3134 3135 return false; 3136 } 3137 3138 case TargetOpcode::G_ANYEXT: { 3139 if (selectUSMovFromExtend(I, MRI)) 3140 return true; 3141 3142 const Register DstReg = I.getOperand(0).getReg(); 3143 const Register SrcReg = I.getOperand(1).getReg(); 3144 3145 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3146 if (RBDst.getID() != AArch64::GPRRegBankID) { 3147 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3148 << ", expected: GPR\n"); 3149 return false; 3150 } 3151 3152 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3153 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3154 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3155 << ", expected: GPR\n"); 3156 return false; 3157 } 3158 3159 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3160 3161 if (DstSize == 0) { 3162 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3163 return false; 3164 } 3165 3166 if (DstSize != 64 && DstSize > 32) { 3167 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3168 << ", expected: 32 or 64\n"); 3169 return false; 3170 } 3171 // At this point G_ANYEXT is just like a plain COPY, but we need 3172 // to explicitly form the 64-bit value if any. 3173 if (DstSize > 32) { 3174 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3175 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3176 .addDef(ExtSrc) 3177 .addImm(0) 3178 .addUse(SrcReg) 3179 .addImm(AArch64::sub_32); 3180 I.getOperand(1).setReg(ExtSrc); 3181 } 3182 return selectCopy(I, TII, MRI, TRI, RBI); 3183 } 3184 3185 case TargetOpcode::G_ZEXT: 3186 case TargetOpcode::G_SEXT_INREG: 3187 case TargetOpcode::G_SEXT: { 3188 if (selectUSMovFromExtend(I, MRI)) 3189 return true; 3190 3191 unsigned Opcode = I.getOpcode(); 3192 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3193 const Register DefReg = I.getOperand(0).getReg(); 3194 Register SrcReg = I.getOperand(1).getReg(); 3195 const LLT DstTy = MRI.getType(DefReg); 3196 const LLT SrcTy = MRI.getType(SrcReg); 3197 unsigned DstSize = DstTy.getSizeInBits(); 3198 unsigned SrcSize = SrcTy.getSizeInBits(); 3199 3200 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3201 // extended is encoded in the imm. 
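    //
    // E.g. (illustrative): %d:gpr(s64) = G_SEXT_INREG %x:gpr(s64), 8 extends
    // the low 8 bits of %x, so SrcSize comes from the immediate (8) rather
    // than from the s64 source type, and the code below selects it as
    // SBFMXri %x, 0, 7.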
3202 if (Opcode == TargetOpcode::G_SEXT_INREG) 3203 SrcSize = I.getOperand(2).getImm(); 3204 3205 if (DstTy.isVector()) 3206 return false; // Should be handled by imported patterns. 3207 3208 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3209 AArch64::GPRRegBankID && 3210 "Unexpected ext regbank"); 3211 3212 MachineInstr *ExtI; 3213 3214 // First check if we're extending the result of a load which has a dest type 3215 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest 3216 // GPR register on AArch64 and all loads which are smaller automatically 3217 // zero-extend the upper bits. E.g. 3218 // %v(s8) = G_LOAD %p, :: (load 1) 3219 // %v2(s32) = G_ZEXT %v(s8) 3220 if (!IsSigned) { 3221 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3222 bool IsGPR = 3223 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3224 if (LoadMI && IsGPR) { 3225 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3226 unsigned BytesLoaded = MemOp->getSize(); 3227 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3228 return selectCopy(I, TII, MRI, TRI, RBI); 3229 } 3230 3231 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3232 // + SUBREG_TO_REG. 3233 // 3234 // If we are zero extending from 32 bits to 64 bits, it's possible that 3235 // the instruction implicitly does the zero extend for us. In that case, 3236 // we only need the SUBREG_TO_REG. 3237 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3238 // Unlike with the G_LOAD case, we don't want to look through copies 3239 // here. (See isDef32.) 3240 MachineInstr *Def = MRI.getVRegDef(SrcReg); 3241 Register SubregToRegSrc = SrcReg; 3242 3243 // Does the instruction implicitly zero extend? 3244 if (!Def || !isDef32(*Def)) { 3245 // No. Zero out using an OR. 3246 Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3247 const Register ZReg = AArch64::WZR; 3248 MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0); 3249 SubregToRegSrc = OrDst; 3250 } 3251 3252 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3253 .addImm(0) 3254 .addUse(SubregToRegSrc) 3255 .addImm(AArch64::sub_32); 3256 3257 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3258 MRI)) { 3259 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3260 return false; 3261 } 3262 3263 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3264 MRI)) { 3265 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3266 return false; 3267 } 3268 3269 I.eraseFromParent(); 3270 return true; 3271 } 3272 } 3273 3274 if (DstSize == 64) { 3275 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3276 // FIXME: Can we avoid manually doing this? 3277 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3278 MRI)) { 3279 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3280 << " operand\n"); 3281 return false; 3282 } 3283 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3284 {&AArch64::GPR64RegClass}, {}) 3285 .addImm(0) 3286 .addUse(SrcReg) 3287 .addImm(AArch64::sub_32) 3288 .getReg(0); 3289 } 3290 3291 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3292 {DefReg}, {SrcReg}) 3293 .addImm(0) 3294 .addImm(SrcSize - 1); 3295 } else if (DstSize <= 32) { 3296 ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri, 3297 {DefReg}, {SrcReg}) 3298 .addImm(0) 3299 .addImm(SrcSize - 1); 3300 } else { 3301 return false; 3302 } 3303 3304 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3305 I.eraseFromParent(); 3306 return true; 3307 } 3308 3309 case TargetOpcode::G_SITOFP: 3310 case TargetOpcode::G_UITOFP: 3311 case TargetOpcode::G_FPTOSI: 3312 case TargetOpcode::G_FPTOUI: { 3313 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3314 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3315 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3316 if (NewOpc == Opcode) 3317 return false; 3318 3319 I.setDesc(TII.get(NewOpc)); 3320 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3321 I.setFlags(MachineInstr::NoFPExcept); 3322 3323 return true; 3324 } 3325 3326 case TargetOpcode::G_FREEZE: 3327 return selectCopy(I, TII, MRI, TRI, RBI); 3328 3329 case TargetOpcode::G_INTTOPTR: 3330 // The importer is currently unable to import pointer types since they 3331 // didn't exist in SelectionDAG. 3332 return selectCopy(I, TII, MRI, TRI, RBI); 3333 3334 case TargetOpcode::G_BITCAST: 3335 // Imported SelectionDAG rules can handle every bitcast except those that 3336 // bitcast from a type to the same type. Ideally, these shouldn't occur 3337 // but we might not run an optimizer that deletes them. The other exception 3338 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3339 // of them. 3340 return selectCopy(I, TII, MRI, TRI, RBI); 3341 3342 case TargetOpcode::G_SELECT: { 3343 auto &Sel = cast<GSelect>(I); 3344 const Register CondReg = Sel.getCondReg(); 3345 const Register TReg = Sel.getTrueReg(); 3346 const Register FReg = Sel.getFalseReg(); 3347 3348 if (tryOptSelect(Sel)) 3349 return true; 3350 3351 // Make sure to use an unused vreg instead of wzr, so that the peephole 3352 // optimizations will be able to optimize these. 3353 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3354 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3355 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3356 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3357 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) 3358 return false; 3359 Sel.eraseFromParent(); 3360 return true; 3361 } 3362 case TargetOpcode::G_ICMP: { 3363 if (Ty.isVector()) 3364 return selectVectorICmp(I, MRI); 3365 3366 if (Ty != LLT::scalar(32)) { 3367 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3368 << ", expected: " << LLT::scalar(32) << '\n'); 3369 return false; 3370 } 3371 3372 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3373 const AArch64CC::CondCode InvCC = 3374 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3375 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3376 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3377 /*Src2=*/AArch64::WZR, InvCC, MIB); 3378 I.eraseFromParent(); 3379 return true; 3380 } 3381 3382 case TargetOpcode::G_FCMP: { 3383 CmpInst::Predicate Pred = 3384 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3385 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3386 Pred) || 3387 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3388 return false; 3389 I.eraseFromParent(); 3390 return true; 3391 } 3392 case TargetOpcode::G_VASTART: 3393 return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI)
3394 : selectVaStartAAPCS(I, MF, MRI);
3395 case TargetOpcode::G_INTRINSIC:
3396 return selectIntrinsic(I, MRI);
3397 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3398 return selectIntrinsicWithSideEffects(I, MRI);
3399 case TargetOpcode::G_IMPLICIT_DEF: {
3400 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3401 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3402 const Register DstReg = I.getOperand(0).getReg();
3403 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3404 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3405 RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3406 return true;
3407 }
3408 case TargetOpcode::G_BLOCK_ADDR: {
3409 if (TM.getCodeModel() == CodeModel::Large) {
3410 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3411 I.eraseFromParent();
3412 return true;
3413 } else {
3414 I.setDesc(TII.get(AArch64::MOVaddrBA));
3415 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3416 I.getOperand(0).getReg())
3417 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3418 /* Offset */ 0, AArch64II::MO_PAGE)
3419 .addBlockAddress(
3420 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3421 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3422 I.eraseFromParent();
3423 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3424 }
3425 }
3426 case AArch64::G_DUP: {
3427 // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by the
3428 // imported patterns, so do it manually here. Avoiding s16 gpr values
3429 // altogether is difficult: adding an anyext at RegBankSelect to fix this
3430 // could pessimize the fpr case. Manual selection is the most robust
3431 // solution for now.
3432 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3433 AArch64::GPRRegBankID)
3434 return false; // We expect the fpr regbank case to be imported.
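  // Pick the GPR-source DUP variant that matches the destination vector type.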
3435 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3436 if (VecTy == LLT::fixed_vector(8, 8)) 3437 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3438 else if (VecTy == LLT::fixed_vector(16, 8)) 3439 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3440 else if (VecTy == LLT::fixed_vector(4, 16)) 3441 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3442 else if (VecTy == LLT::fixed_vector(8, 16)) 3443 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3444 else 3445 return false; 3446 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3447 } 3448 case TargetOpcode::G_INTRINSIC_TRUNC: 3449 return selectIntrinsicTrunc(I, MRI); 3450 case TargetOpcode::G_INTRINSIC_ROUND: 3451 return selectIntrinsicRound(I, MRI); 3452 case TargetOpcode::G_BUILD_VECTOR: 3453 return selectBuildVector(I, MRI); 3454 case TargetOpcode::G_MERGE_VALUES: 3455 return selectMergeValues(I, MRI); 3456 case TargetOpcode::G_UNMERGE_VALUES: 3457 return selectUnmergeValues(I, MRI); 3458 case TargetOpcode::G_SHUFFLE_VECTOR: 3459 return selectShuffleVector(I, MRI); 3460 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3461 return selectExtractElt(I, MRI); 3462 case TargetOpcode::G_INSERT_VECTOR_ELT: 3463 return selectInsertElt(I, MRI); 3464 case TargetOpcode::G_CONCAT_VECTORS: 3465 return selectConcatVectors(I, MRI); 3466 case TargetOpcode::G_JUMP_TABLE: 3467 return selectJumpTable(I, MRI); 3468 case TargetOpcode::G_VECREDUCE_FADD: 3469 case TargetOpcode::G_VECREDUCE_ADD: 3470 return selectReduction(I, MRI); 3471 case TargetOpcode::G_MEMCPY: 3472 case TargetOpcode::G_MEMCPY_INLINE: 3473 case TargetOpcode::G_MEMMOVE: 3474 case TargetOpcode::G_MEMSET: 3475 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); 3476 return selectMOPS(I, MRI); 3477 } 3478 3479 return false; 3480 } 3481 3482 bool AArch64InstructionSelector::selectReduction(MachineInstr &I, 3483 MachineRegisterInfo &MRI) { 3484 Register VecReg = I.getOperand(1).getReg(); 3485 LLT VecTy = MRI.getType(VecReg); 3486 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { 3487 // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit 3488 // a subregister copy afterwards. 
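  // The scalar sum lands in the low 32 bits of that FPR64, so the copy below
  // extracts the ssub subregister.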
3489 if (VecTy == LLT::fixed_vector(2, 32)) { 3490 Register DstReg = I.getOperand(0).getReg(); 3491 auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass}, 3492 {VecReg, VecReg}); 3493 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3494 .addReg(AddP.getReg(0), 0, AArch64::ssub) 3495 .getReg(0); 3496 RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI); 3497 I.eraseFromParent(); 3498 return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI); 3499 } 3500 3501 unsigned Opc = 0; 3502 if (VecTy == LLT::fixed_vector(16, 8)) 3503 Opc = AArch64::ADDVv16i8v; 3504 else if (VecTy == LLT::fixed_vector(8, 16)) 3505 Opc = AArch64::ADDVv8i16v; 3506 else if (VecTy == LLT::fixed_vector(4, 32)) 3507 Opc = AArch64::ADDVv4i32v; 3508 else if (VecTy == LLT::fixed_vector(2, 64)) 3509 Opc = AArch64::ADDPv2i64p; 3510 else { 3511 LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); 3512 return false; 3513 } 3514 I.setDesc(TII.get(Opc)); 3515 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3516 } 3517 3518 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) { 3519 unsigned Opc = 0; 3520 if (VecTy == LLT::fixed_vector(2, 32)) 3521 Opc = AArch64::FADDPv2i32p; 3522 else if (VecTy == LLT::fixed_vector(2, 64)) 3523 Opc = AArch64::FADDPv2i64p; 3524 else { 3525 LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction"); 3526 return false; 3527 } 3528 I.setDesc(TII.get(Opc)); 3529 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3530 } 3531 return false; 3532 } 3533 3534 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, 3535 MachineRegisterInfo &MRI) { 3536 unsigned Mopcode; 3537 switch (GI.getOpcode()) { 3538 case TargetOpcode::G_MEMCPY: 3539 case TargetOpcode::G_MEMCPY_INLINE: 3540 Mopcode = AArch64::MOPSMemoryCopyPseudo; 3541 break; 3542 case TargetOpcode::G_MEMMOVE: 3543 Mopcode = AArch64::MOPSMemoryMovePseudo; 3544 break; 3545 case TargetOpcode::G_MEMSET: 3546 // For tagged memset see llvm.aarch64.mops.memset.tag 3547 Mopcode = AArch64::MOPSMemorySetPseudo; 3548 break; 3549 } 3550 3551 auto &DstPtr = GI.getOperand(0); 3552 auto &SrcOrVal = GI.getOperand(1); 3553 auto &Size = GI.getOperand(2); 3554 3555 // Create copies of the registers that can be clobbered. 3556 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); 3557 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); 3558 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); 3559 3560 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; 3561 const auto &SrcValRegClass = 3562 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; 3563 3564 // Constrain to specific registers 3565 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); 3566 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); 3567 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); 3568 3569 MIB.buildCopy(DstPtrCopy, DstPtr); 3570 MIB.buildCopy(SrcValCopy, SrcOrVal); 3571 MIB.buildCopy(SizeCopy, Size); 3572 3573 // New instruction uses the copied registers because it must update them. 3574 // The defs are not used since they don't exist in G_MEM*. They are still 3575 // tied. 
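  // Fresh vregs are created below to stand in for those tied defs.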
3576 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE 3577 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); 3578 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3579 if (IsSet) { 3580 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, 3581 {DstPtrCopy, SizeCopy, SrcValCopy}); 3582 } else { 3583 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); 3584 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, 3585 {DstPtrCopy, SrcValCopy, SizeCopy}); 3586 } 3587 3588 GI.eraseFromParent(); 3589 return true; 3590 } 3591 3592 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3593 MachineRegisterInfo &MRI) { 3594 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3595 Register JTAddr = I.getOperand(0).getReg(); 3596 unsigned JTI = I.getOperand(1).getIndex(); 3597 Register Index = I.getOperand(2).getReg(); 3598 3599 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3600 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3601 3602 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3603 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3604 {TargetReg, ScratchReg}, {JTAddr, Index}) 3605 .addJumpTableIndex(JTI); 3606 // Build the indirect branch. 3607 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3608 I.eraseFromParent(); 3609 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3610 } 3611 3612 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3613 MachineRegisterInfo &MRI) { 3614 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3615 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3616 3617 Register DstReg = I.getOperand(0).getReg(); 3618 unsigned JTI = I.getOperand(1).getIndex(); 3619 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 3620 auto MovMI = 3621 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3622 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3623 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3624 I.eraseFromParent(); 3625 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3626 } 3627 3628 bool AArch64InstructionSelector::selectTLSGlobalValue( 3629 MachineInstr &I, MachineRegisterInfo &MRI) { 3630 if (!STI.isTargetMachO()) 3631 return false; 3632 MachineFunction &MF = *I.getParent()->getParent(); 3633 MF.getFrameInfo().setAdjustsStack(true); 3634 3635 const auto &GlobalOp = I.getOperand(1); 3636 assert(GlobalOp.getOffset() == 0 && 3637 "Shouldn't have an offset on TLS globals!"); 3638 const GlobalValue &GV = *GlobalOp.getGlobal(); 3639 3640 auto LoadGOT = 3641 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3642 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3643 3644 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3645 {LoadGOT.getReg(0)}) 3646 .addImm(0); 3647 3648 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3649 // TLS calls preserve all registers except those that absolutely must be 3650 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3651 // silly). 
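  // That convention is what TRI.getTLSCallPreservedMask() encodes below.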
3652 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) 3653 .addUse(AArch64::X0, RegState::Implicit) 3654 .addDef(AArch64::X0, RegState::Implicit) 3655 .addRegMask(TRI.getTLSCallPreservedMask()); 3656 3657 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3658 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3659 MRI); 3660 I.eraseFromParent(); 3661 return true; 3662 } 3663 3664 bool AArch64InstructionSelector::selectIntrinsicTrunc( 3665 MachineInstr &I, MachineRegisterInfo &MRI) const { 3666 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3667 3668 // Select the correct opcode. 3669 unsigned Opc = 0; 3670 if (!SrcTy.isVector()) { 3671 switch (SrcTy.getSizeInBits()) { 3672 default: 3673 case 16: 3674 Opc = AArch64::FRINTZHr; 3675 break; 3676 case 32: 3677 Opc = AArch64::FRINTZSr; 3678 break; 3679 case 64: 3680 Opc = AArch64::FRINTZDr; 3681 break; 3682 } 3683 } else { 3684 unsigned NumElts = SrcTy.getNumElements(); 3685 switch (SrcTy.getElementType().getSizeInBits()) { 3686 default: 3687 break; 3688 case 16: 3689 if (NumElts == 4) 3690 Opc = AArch64::FRINTZv4f16; 3691 else if (NumElts == 8) 3692 Opc = AArch64::FRINTZv8f16; 3693 break; 3694 case 32: 3695 if (NumElts == 2) 3696 Opc = AArch64::FRINTZv2f32; 3697 else if (NumElts == 4) 3698 Opc = AArch64::FRINTZv4f32; 3699 break; 3700 case 64: 3701 if (NumElts == 2) 3702 Opc = AArch64::FRINTZv2f64; 3703 break; 3704 } 3705 } 3706 3707 if (!Opc) { 3708 // Didn't get an opcode above, bail. 3709 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); 3710 return false; 3711 } 3712 3713 // Legalization would have set us up perfectly for this; we just need to 3714 // set the opcode and move on. 3715 I.setDesc(TII.get(Opc)); 3716 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3717 } 3718 3719 bool AArch64InstructionSelector::selectIntrinsicRound( 3720 MachineInstr &I, MachineRegisterInfo &MRI) const { 3721 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3722 3723 // Select the correct opcode. 3724 unsigned Opc = 0; 3725 if (!SrcTy.isVector()) { 3726 switch (SrcTy.getSizeInBits()) { 3727 default: 3728 case 16: 3729 Opc = AArch64::FRINTAHr; 3730 break; 3731 case 32: 3732 Opc = AArch64::FRINTASr; 3733 break; 3734 case 64: 3735 Opc = AArch64::FRINTADr; 3736 break; 3737 } 3738 } else { 3739 unsigned NumElts = SrcTy.getNumElements(); 3740 switch (SrcTy.getElementType().getSizeInBits()) { 3741 default: 3742 break; 3743 case 16: 3744 if (NumElts == 4) 3745 Opc = AArch64::FRINTAv4f16; 3746 else if (NumElts == 8) 3747 Opc = AArch64::FRINTAv8f16; 3748 break; 3749 case 32: 3750 if (NumElts == 2) 3751 Opc = AArch64::FRINTAv2f32; 3752 else if (NumElts == 4) 3753 Opc = AArch64::FRINTAv4f32; 3754 break; 3755 case 64: 3756 if (NumElts == 2) 3757 Opc = AArch64::FRINTAv2f64; 3758 break; 3759 } 3760 } 3761 3762 if (!Opc) { 3763 // Didn't get an opcode above, bail. 3764 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); 3765 return false; 3766 } 3767 3768 // Legalization would have set us up perfectly for this; we just need to 3769 // set the opcode and move on. 
3770 I.setDesc(TII.get(Opc)); 3771 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3772 } 3773 3774 bool AArch64InstructionSelector::selectVectorICmp( 3775 MachineInstr &I, MachineRegisterInfo &MRI) { 3776 Register DstReg = I.getOperand(0).getReg(); 3777 LLT DstTy = MRI.getType(DstReg); 3778 Register SrcReg = I.getOperand(2).getReg(); 3779 Register Src2Reg = I.getOperand(3).getReg(); 3780 LLT SrcTy = MRI.getType(SrcReg); 3781 3782 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); 3783 unsigned NumElts = DstTy.getNumElements(); 3784 3785 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b 3786 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 3787 // Third index is cc opcode: 3788 // 0 == eq 3789 // 1 == ugt 3790 // 2 == uge 3791 // 3 == ult 3792 // 4 == ule 3793 // 5 == sgt 3794 // 6 == sge 3795 // 7 == slt 3796 // 8 == sle 3797 // ne is done by negating 'eq' result. 3798 3799 // This table below assumes that for some comparisons the operands will be 3800 // commuted. 3801 // ult op == commute + ugt op 3802 // ule op == commute + uge op 3803 // slt op == commute + sgt op 3804 // sle op == commute + sge op 3805 unsigned PredIdx = 0; 3806 bool SwapOperands = false; 3807 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 3808 switch (Pred) { 3809 case CmpInst::ICMP_NE: 3810 case CmpInst::ICMP_EQ: 3811 PredIdx = 0; 3812 break; 3813 case CmpInst::ICMP_UGT: 3814 PredIdx = 1; 3815 break; 3816 case CmpInst::ICMP_UGE: 3817 PredIdx = 2; 3818 break; 3819 case CmpInst::ICMP_ULT: 3820 PredIdx = 3; 3821 SwapOperands = true; 3822 break; 3823 case CmpInst::ICMP_ULE: 3824 PredIdx = 4; 3825 SwapOperands = true; 3826 break; 3827 case CmpInst::ICMP_SGT: 3828 PredIdx = 5; 3829 break; 3830 case CmpInst::ICMP_SGE: 3831 PredIdx = 6; 3832 break; 3833 case CmpInst::ICMP_SLT: 3834 PredIdx = 7; 3835 SwapOperands = true; 3836 break; 3837 case CmpInst::ICMP_SLE: 3838 PredIdx = 8; 3839 SwapOperands = true; 3840 break; 3841 default: 3842 llvm_unreachable("Unhandled icmp predicate"); 3843 return false; 3844 } 3845 3846 // This table obviously should be tablegen'd when we have our GISel native 3847 // tablegen selector. 
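  // Zero entries mark element-size/element-count/predicate combinations that
  // have no single compare instruction.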
3848 3849 static const unsigned OpcTable[4][4][9] = { 3850 { 3851 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3852 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3853 0 /* invalid */}, 3854 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3855 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3856 0 /* invalid */}, 3857 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, 3858 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, 3859 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, 3860 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, 3861 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, 3862 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} 3863 }, 3864 { 3865 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3866 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3867 0 /* invalid */}, 3868 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, 3869 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, 3870 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, 3871 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, 3872 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, 3873 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, 3874 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3875 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3876 0 /* invalid */} 3877 }, 3878 { 3879 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, 3880 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, 3881 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, 3882 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, 3883 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, 3884 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, 3885 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3886 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3887 0 /* invalid */}, 3888 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3889 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3890 0 /* invalid */} 3891 }, 3892 { 3893 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, 3894 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, 3895 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, 3896 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3897 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3898 0 /* invalid */}, 3899 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3900 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3901 0 /* invalid */}, 3902 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3903 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3904 0 /* invalid */} 3905 }, 3906 }; 3907 unsigned EltIdx = Log2_32(SrcEltSize / 8); 3908 unsigned NumEltsIdx = Log2_32(NumElts / 2); 3909 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; 3910 if (!Opc) { 3911 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); 3912 return false; 3913 } 3914 3915 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3916 const TargetRegisterClass *SrcRC = 3917 getRegClassForTypeOnBank(SrcTy, VecRB, true); 3918 if (!SrcRC) { 3919 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3920 return 
false; 3921 } 3922 3923 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; 3924 if (SrcTy.getSizeInBits() == 128) 3925 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; 3926 3927 if (SwapOperands) 3928 std::swap(SrcReg, Src2Reg); 3929 3930 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); 3931 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3932 3933 // Invert if we had a 'ne' cc. 3934 if (NotOpc) { 3935 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); 3936 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3937 } else { 3938 MIB.buildCopy(DstReg, Cmp.getReg(0)); 3939 } 3940 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); 3941 I.eraseFromParent(); 3942 return true; 3943 } 3944 3945 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3946 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3947 MachineIRBuilder &MIRBuilder) const { 3948 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3949 3950 auto BuildFn = [&](unsigned SubregIndex) { 3951 auto Ins = 3952 MIRBuilder 3953 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3954 .addImm(SubregIndex); 3955 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3956 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3957 return &*Ins; 3958 }; 3959 3960 switch (EltSize) { 3961 case 16: 3962 return BuildFn(AArch64::hsub); 3963 case 32: 3964 return BuildFn(AArch64::ssub); 3965 case 64: 3966 return BuildFn(AArch64::dsub); 3967 default: 3968 return nullptr; 3969 } 3970 } 3971 3972 bool AArch64InstructionSelector::selectMergeValues( 3973 MachineInstr &I, MachineRegisterInfo &MRI) { 3974 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 3975 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3976 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3977 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 3978 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 3979 3980 if (I.getNumOperands() != 3) 3981 return false; 3982 3983 // Merging 2 s64s into an s128. 
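  // This is done as two lane inserts into an IMPLICIT_DEF of the 128-bit type.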
3984 if (DstTy == LLT::scalar(128)) { 3985 if (SrcTy.getSizeInBits() != 64) 3986 return false; 3987 Register DstReg = I.getOperand(0).getReg(); 3988 Register Src1Reg = I.getOperand(1).getReg(); 3989 Register Src2Reg = I.getOperand(2).getReg(); 3990 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 3991 MachineInstr *InsMI = 3992 emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); 3993 if (!InsMI) 3994 return false; 3995 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 3996 Src2Reg, /* LaneIdx */ 1, RB, MIB); 3997 if (!Ins2MI) 3998 return false; 3999 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 4000 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 4001 I.eraseFromParent(); 4002 return true; 4003 } 4004 4005 if (RB.getID() != AArch64::GPRRegBankID) 4006 return false; 4007 4008 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 4009 return false; 4010 4011 auto *DstRC = &AArch64::GPR64RegClass; 4012 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 4013 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 4014 TII.get(TargetOpcode::SUBREG_TO_REG)) 4015 .addDef(SubToRegDef) 4016 .addImm(0) 4017 .addUse(I.getOperand(1).getReg()) 4018 .addImm(AArch64::sub_32); 4019 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 4020 // Need to anyext the second scalar before we can use bfm 4021 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 4022 TII.get(TargetOpcode::SUBREG_TO_REG)) 4023 .addDef(SubToRegDef2) 4024 .addImm(0) 4025 .addUse(I.getOperand(2).getReg()) 4026 .addImm(AArch64::sub_32); 4027 MachineInstr &BFM = 4028 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 4029 .addDef(I.getOperand(0).getReg()) 4030 .addUse(SubToRegDef) 4031 .addUse(SubToRegDef2) 4032 .addImm(32) 4033 .addImm(31); 4034 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 4035 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 4036 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 4037 I.eraseFromParent(); 4038 return true; 4039 } 4040 4041 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 4042 const unsigned EltSize) { 4043 // Choose a lane copy opcode and subregister based off of the size of the 4044 // vector's elements. 4045 switch (EltSize) { 4046 case 8: 4047 CopyOpc = AArch64::DUPi8; 4048 ExtractSubReg = AArch64::bsub; 4049 break; 4050 case 16: 4051 CopyOpc = AArch64::DUPi16; 4052 ExtractSubReg = AArch64::hsub; 4053 break; 4054 case 32: 4055 CopyOpc = AArch64::DUPi32; 4056 ExtractSubReg = AArch64::ssub; 4057 break; 4058 case 64: 4059 CopyOpc = AArch64::DUPi64; 4060 ExtractSubReg = AArch64::dsub; 4061 break; 4062 default: 4063 // Unknown size, bail out. 
4064 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 4065 return false; 4066 } 4067 return true; 4068 } 4069 4070 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 4071 Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 4072 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 4073 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4074 unsigned CopyOpc = 0; 4075 unsigned ExtractSubReg = 0; 4076 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 4077 LLVM_DEBUG( 4078 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 4079 return nullptr; 4080 } 4081 4082 const TargetRegisterClass *DstRC = 4083 getRegClassForTypeOnBank(ScalarTy, DstRB, true); 4084 if (!DstRC) { 4085 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 4086 return nullptr; 4087 } 4088 4089 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 4090 const LLT &VecTy = MRI.getType(VecReg); 4091 const TargetRegisterClass *VecRC = 4092 getRegClassForTypeOnBank(VecTy, VecRB, true); 4093 if (!VecRC) { 4094 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 4095 return nullptr; 4096 } 4097 4098 // The register that we're going to copy into. 4099 Register InsertReg = VecReg; 4100 if (!DstReg) 4101 DstReg = MRI.createVirtualRegister(DstRC); 4102 // If the lane index is 0, we just use a subregister COPY. 4103 if (LaneIdx == 0) { 4104 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 4105 .addReg(VecReg, 0, ExtractSubReg); 4106 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4107 return &*Copy; 4108 } 4109 4110 // Lane copies require 128-bit wide registers. If we're dealing with an 4111 // unpacked vector, then we need to move up to that width. Insert an implicit 4112 // def and a subregister insert to get us there. 4113 if (VecTy.getSizeInBits() != 128) { 4114 MachineInstr *ScalarToVector = emitScalarToVector( 4115 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 4116 if (!ScalarToVector) 4117 return nullptr; 4118 InsertReg = ScalarToVector->getOperand(0).getReg(); 4119 } 4120 4121 MachineInstr *LaneCopyMI = 4122 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 4123 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 4124 4125 // Make sure that we actually constrain the initial copy. 4126 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4127 return LaneCopyMI; 4128 } 4129 4130 bool AArch64InstructionSelector::selectExtractElt( 4131 MachineInstr &I, MachineRegisterInfo &MRI) { 4132 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 4133 "unexpected opcode!"); 4134 Register DstReg = I.getOperand(0).getReg(); 4135 const LLT NarrowTy = MRI.getType(DstReg); 4136 const Register SrcReg = I.getOperand(1).getReg(); 4137 const LLT WideTy = MRI.getType(SrcReg); 4138 (void)WideTy; 4139 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 4140 "source register size too small!"); 4141 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 4142 4143 // Need the lane index to determine the correct copy opcode. 4144 MachineOperand &LaneIdxOp = I.getOperand(2); 4145 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 4146 4147 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 4148 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 4149 return false; 4150 } 4151 4152 // Find the index to extract from. 
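  // Only constant lane indices are supported; bail out otherwise.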
4153 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 4154 if (!VRegAndVal) 4155 return false; 4156 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 4157 4158 4159 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 4160 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 4161 LaneIdx, MIB); 4162 if (!Extract) 4163 return false; 4164 4165 I.eraseFromParent(); 4166 return true; 4167 } 4168 4169 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 4170 MachineInstr &I, MachineRegisterInfo &MRI) { 4171 unsigned NumElts = I.getNumOperands() - 1; 4172 Register SrcReg = I.getOperand(NumElts).getReg(); 4173 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4174 const LLT SrcTy = MRI.getType(SrcReg); 4175 4176 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 4177 if (SrcTy.getSizeInBits() > 128) { 4178 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 4179 return false; 4180 } 4181 4182 // We implement a split vector operation by treating the sub-vectors as 4183 // scalars and extracting them. 4184 const RegisterBank &DstRB = 4185 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 4186 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4187 Register Dst = I.getOperand(OpIdx).getReg(); 4188 MachineInstr *Extract = 4189 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4190 if (!Extract) 4191 return false; 4192 } 4193 I.eraseFromParent(); 4194 return true; 4195 } 4196 4197 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4198 MachineRegisterInfo &MRI) { 4199 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4200 "unexpected opcode"); 4201 4202 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4203 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4204 AArch64::FPRRegBankID || 4205 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4206 AArch64::FPRRegBankID) { 4207 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4208 "currently unsupported.\n"); 4209 return false; 4210 } 4211 4212 // The last operand is the vector source register, and every other operand is 4213 // a register to unpack into. 4214 unsigned NumElts = I.getNumOperands() - 1; 4215 Register SrcReg = I.getOperand(NumElts).getReg(); 4216 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4217 const LLT WideTy = MRI.getType(SrcReg); 4218 (void)WideTy; 4219 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4220 "can only unmerge from vector or s128 types!"); 4221 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4222 "source register size too small!"); 4223 4224 if (!NarrowTy.isScalar()) 4225 return selectSplitVectorUnmerge(I, MRI); 4226 4227 // Choose a lane copy opcode and subregister based off of the size of the 4228 // vector's elements. 4229 unsigned CopyOpc = 0; 4230 unsigned ExtractSubReg = 0; 4231 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4232 return false; 4233 4234 // Set up for the lane copies. 4235 MachineBasicBlock &MBB = *I.getParent(); 4236 4237 // Stores the registers we'll be copying from. 4238 SmallVector<Register, 4> InsertRegs; 4239 4240 // We'll use the first register twice, so we only need NumElts-1 registers. 4241 unsigned NumInsertRegs = NumElts - 1; 4242 4243 // If our elements fit into exactly 128 bits, then we can copy from the source 4244 // directly. 
Otherwise, we need to do a bit of setup with some subregister
4245 // inserts.
4246 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4247 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4248 } else {
4249 // No. We have to perform subregister inserts. For each insert, create an
4250 // implicit def and a subregister insert, and save the register we create.
4251 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4252 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4253 *RBI.getRegBank(SrcReg, MRI, TRI));
4254 unsigned SubReg = 0;
4255 bool Found = getSubRegForClass(RC, TRI, SubReg);
4256 (void)Found;
4257 assert(Found && "expected to find last operand's subreg idx");
4258 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4259 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4260 MachineInstr &ImpDefMI =
4261 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4262 ImpDefReg);
4263
4264 // Now, create the subregister insert from SrcReg.
4265 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4266 MachineInstr &InsMI =
4267 *BuildMI(MBB, I, I.getDebugLoc(),
4268 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4269 .addUse(ImpDefReg)
4270 .addUse(SrcReg)
4271 .addImm(SubReg);
4272
4273 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4274 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4275
4276 // Save the register so that we can copy from it after.
4277 InsertRegs.push_back(InsertReg);
4278 }
4279 }
4280
4281 // Now that we've created any necessary subregister inserts, we can
4282 // create the copies.
4283 //
4284 // Perform the first copy separately as a subregister copy.
4285 Register CopyTo = I.getOperand(0).getReg();
4286 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4287 .addReg(InsertRegs[0], 0, ExtractSubReg);
4288 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4289
4290 // Now, perform the remaining copies as vector lane copies.
4291 unsigned LaneIdx = 1;
4292 for (Register InsReg : InsertRegs) {
4293 Register CopyTo = I.getOperand(LaneIdx).getReg();
4294 MachineInstr &CopyInst =
4295 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4296 .addUse(InsReg)
4297 .addImm(LaneIdx);
4298 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4299 ++LaneIdx;
4300 }
4301
4302 // Separately constrain the first copy's destination. Because of the
4303 // limitation in constrainOperandRegClass, we can't guarantee that this will
4304 // actually be constrained. So, do it ourselves using the second operand.
4305 const TargetRegisterClass *RC = 4306 MRI.getRegClassOrNull(I.getOperand(1).getReg()); 4307 if (!RC) { 4308 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); 4309 return false; 4310 } 4311 4312 RBI.constrainGenericRegister(CopyTo, *RC, MRI); 4313 I.eraseFromParent(); 4314 return true; 4315 } 4316 4317 bool AArch64InstructionSelector::selectConcatVectors( 4318 MachineInstr &I, MachineRegisterInfo &MRI) { 4319 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && 4320 "Unexpected opcode"); 4321 Register Dst = I.getOperand(0).getReg(); 4322 Register Op1 = I.getOperand(1).getReg(); 4323 Register Op2 = I.getOperand(2).getReg(); 4324 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); 4325 if (!ConcatMI) 4326 return false; 4327 I.eraseFromParent(); 4328 return true; 4329 } 4330 4331 unsigned 4332 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, 4333 MachineFunction &MF) const { 4334 Type *CPTy = CPVal->getType(); 4335 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); 4336 4337 MachineConstantPool *MCP = MF.getConstantPool(); 4338 return MCP->getConstantPoolIndex(CPVal, Alignment); 4339 } 4340 4341 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( 4342 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { 4343 auto &MF = MIRBuilder.getMF(); 4344 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); 4345 4346 auto Adrp = 4347 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) 4348 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); 4349 4350 MachineInstr *LoadMI = nullptr; 4351 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); 4352 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); 4353 switch (Size) { 4354 case 16: 4355 LoadMI = 4356 &*MIRBuilder 4357 .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) 4358 .addConstantPoolIndex(CPIdx, 0, 4359 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4360 break; 4361 case 8: 4362 LoadMI = 4363 &*MIRBuilder 4364 .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp}) 4365 .addConstantPoolIndex(CPIdx, 0, 4366 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4367 break; 4368 case 4: 4369 LoadMI = 4370 &*MIRBuilder 4371 .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp}) 4372 .addConstantPoolIndex(CPIdx, 0, 4373 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4374 break; 4375 case 2: 4376 LoadMI = 4377 &*MIRBuilder 4378 .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp}) 4379 .addConstantPoolIndex(CPIdx, 0, 4380 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4381 break; 4382 default: 4383 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " 4384 << *CPVal->getType()); 4385 return nullptr; 4386 } 4387 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, 4388 MachineMemOperand::MOLoad, 4389 Size, Align(Size))); 4390 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); 4391 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); 4392 return LoadMI; 4393 } 4394 4395 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given 4396 /// size and RB. 
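/// GPR-bank sources use the INSvi*gpr opcodes; FPR-bank sources use the
/// INSvi*lane opcodes.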
4397 static std::pair<unsigned, unsigned> 4398 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4399 unsigned Opc, SubregIdx; 4400 if (RB.getID() == AArch64::GPRRegBankID) { 4401 if (EltSize == 16) { 4402 Opc = AArch64::INSvi16gpr; 4403 SubregIdx = AArch64::ssub; 4404 } else if (EltSize == 32) { 4405 Opc = AArch64::INSvi32gpr; 4406 SubregIdx = AArch64::ssub; 4407 } else if (EltSize == 64) { 4408 Opc = AArch64::INSvi64gpr; 4409 SubregIdx = AArch64::dsub; 4410 } else { 4411 llvm_unreachable("invalid elt size!"); 4412 } 4413 } else { 4414 if (EltSize == 8) { 4415 Opc = AArch64::INSvi8lane; 4416 SubregIdx = AArch64::bsub; 4417 } else if (EltSize == 16) { 4418 Opc = AArch64::INSvi16lane; 4419 SubregIdx = AArch64::hsub; 4420 } else if (EltSize == 32) { 4421 Opc = AArch64::INSvi32lane; 4422 SubregIdx = AArch64::ssub; 4423 } else if (EltSize == 64) { 4424 Opc = AArch64::INSvi64lane; 4425 SubregIdx = AArch64::dsub; 4426 } else { 4427 llvm_unreachable("invalid elt size!"); 4428 } 4429 } 4430 return std::make_pair(Opc, SubregIdx); 4431 } 4432 4433 MachineInstr *AArch64InstructionSelector::emitInstr( 4434 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4435 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4436 const ComplexRendererFns &RenderFns) const { 4437 assert(Opcode && "Expected an opcode?"); 4438 assert(!isPreISelGenericOpcode(Opcode) && 4439 "Function should only be used to produce selected instructions!"); 4440 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4441 if (RenderFns) 4442 for (auto &Fn : *RenderFns) 4443 Fn(MI); 4444 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4445 return &*MI; 4446 } 4447 4448 MachineInstr *AArch64InstructionSelector::emitAddSub( 4449 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4450 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4451 MachineIRBuilder &MIRBuilder) const { 4452 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4453 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4454 auto Ty = MRI.getType(LHS.getReg()); 4455 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4456 unsigned Size = Ty.getSizeInBits(); 4457 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4458 bool Is32Bit = Size == 32; 4459 4460 // INSTRri form with positive arithmetic immediate. 4461 if (auto Fns = selectArithImmed(RHS)) 4462 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4463 MIRBuilder, Fns); 4464 4465 // INSTRri form with negative arithmetic immediate. 4466 if (auto Fns = selectNegArithImmed(RHS)) 4467 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4468 MIRBuilder, Fns); 4469 4470 // INSTRrx form. 4471 if (auto Fns = selectArithExtendedRegister(RHS)) 4472 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4473 MIRBuilder, Fns); 4474 4475 // INSTRrs form. 
4476 if (auto Fns = selectShiftedRegister(RHS)) 4477 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4478 MIRBuilder, Fns); 4479 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4480 MIRBuilder); 4481 } 4482 4483 MachineInstr * 4484 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4485 MachineOperand &RHS, 4486 MachineIRBuilder &MIRBuilder) const { 4487 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4488 {{AArch64::ADDXri, AArch64::ADDWri}, 4489 {AArch64::ADDXrs, AArch64::ADDWrs}, 4490 {AArch64::ADDXrr, AArch64::ADDWrr}, 4491 {AArch64::SUBXri, AArch64::SUBWri}, 4492 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4493 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4494 } 4495 4496 MachineInstr * 4497 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4498 MachineOperand &RHS, 4499 MachineIRBuilder &MIRBuilder) const { 4500 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4501 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4502 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4503 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4504 {AArch64::SUBSXri, AArch64::SUBSWri}, 4505 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4506 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4507 } 4508 4509 MachineInstr * 4510 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4511 MachineOperand &RHS, 4512 MachineIRBuilder &MIRBuilder) const { 4513 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4514 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4515 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4516 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4517 {AArch64::ADDSXri, AArch64::ADDSWri}, 4518 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4519 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4520 } 4521 4522 MachineInstr * 4523 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4524 MachineIRBuilder &MIRBuilder) const { 4525 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4526 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4527 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4528 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4529 } 4530 4531 MachineInstr * 4532 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4533 MachineIRBuilder &MIRBuilder) const { 4534 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4535 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4536 LLT Ty = MRI.getType(LHS.getReg()); 4537 unsigned RegSize = Ty.getSizeInBits(); 4538 bool Is32Bit = (RegSize == 32); 4539 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4540 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4541 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4542 // ANDS needs a logical immediate for its immediate form. Check if we can 4543 // fold one in. 
4544 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4545 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4546 4547 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4548 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4549 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4550 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4551 return &*TstMI; 4552 } 4553 } 4554 4555 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4556 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4557 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4558 } 4559 4560 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4561 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4562 MachineIRBuilder &MIRBuilder) const { 4563 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4564 assert(Predicate.isPredicate() && "Expected predicate?"); 4565 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4566 LLT CmpTy = MRI.getType(LHS.getReg()); 4567 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4568 unsigned Size = CmpTy.getSizeInBits(); 4569 (void)Size; 4570 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4571 // Fold the compare into a cmn or tst if possible. 4572 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4573 return FoldCmp; 4574 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4575 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4576 } 4577 4578 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4579 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4580 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4581 #ifndef NDEBUG 4582 LLT Ty = MRI.getType(Dst); 4583 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4584 "Expected a 32-bit scalar register?"); 4585 #endif 4586 const Register ZReg = AArch64::WZR; 4587 AArch64CC::CondCode CC1, CC2; 4588 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4589 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4590 if (CC2 == AArch64CC::AL) 4591 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4592 MIRBuilder); 4593 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4594 Register Def1Reg = MRI.createVirtualRegister(RC); 4595 Register Def2Reg = MRI.createVirtualRegister(RC); 4596 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4597 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4598 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4599 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4600 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4601 return &*OrMI; 4602 } 4603 4604 MachineInstr * 4605 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, 4606 MachineIRBuilder &MIRBuilder, 4607 Optional<CmpInst::Predicate> Pred) const { 4608 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4609 LLT Ty = MRI.getType(LHS); 4610 if (Ty.isVector()) 4611 return nullptr; 4612 unsigned OpSize = Ty.getSizeInBits(); 4613 if (OpSize != 32 && OpSize != 64) 4614 return nullptr; 4615 4616 // If this is a compare against +0.0, then we don't have 4617 // to explicitly materialize a constant. 
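  // The FCMP immediate forms (FCMPSri/FCMPDri) compare directly against #0.0.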
4618 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); 4619 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); 4620 4621 auto IsEqualityPred = [](CmpInst::Predicate P) { 4622 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || 4623 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; 4624 }; 4625 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { 4626 // Try commutating the operands. 4627 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); 4628 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { 4629 ShouldUseImm = true; 4630 std::swap(LHS, RHS); 4631 } 4632 } 4633 unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, 4634 {AArch64::FCMPSri, AArch64::FCMPDri}}; 4635 unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; 4636 4637 // Partially build the compare. Decide if we need to add a use for the 4638 // third operand based off whether or not we're comparing against 0.0. 4639 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); 4640 CmpMI.setMIFlags(MachineInstr::NoFPExcept); 4641 if (!ShouldUseImm) 4642 CmpMI.addUse(RHS); 4643 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); 4644 return &*CmpMI; 4645 } 4646 4647 MachineInstr *AArch64InstructionSelector::emitVectorConcat( 4648 Optional<Register> Dst, Register Op1, Register Op2, 4649 MachineIRBuilder &MIRBuilder) const { 4650 // We implement a vector concat by: 4651 // 1. Use scalar_to_vector to insert the lower vector into the larger dest 4652 // 2. Insert the upper vector into the destination's upper element 4653 // TODO: some of this code is common with G_BUILD_VECTOR handling. 4654 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4655 4656 const LLT Op1Ty = MRI.getType(Op1); 4657 const LLT Op2Ty = MRI.getType(Op2); 4658 4659 if (Op1Ty != Op2Ty) { 4660 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); 4661 return nullptr; 4662 } 4663 assert(Op1Ty.isVector() && "Expected a vector for vector concat"); 4664 4665 if (Op1Ty.getSizeInBits() >= 128) { 4666 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); 4667 return nullptr; 4668 } 4669 4670 // At the moment we just support 64 bit vector concats. 4671 if (Op1Ty.getSizeInBits() != 64) { 4672 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); 4673 return nullptr; 4674 } 4675 4676 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); 4677 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); 4678 const TargetRegisterClass *DstRC = 4679 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); 4680 4681 MachineInstr *WidenedOp1 = 4682 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); 4683 MachineInstr *WidenedOp2 = 4684 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); 4685 if (!WidenedOp1 || !WidenedOp2) { 4686 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); 4687 return nullptr; 4688 } 4689 4690 // Now do the insert of the upper element. 
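  // Both operands were widened into the low half of a 128-bit register above;
  // the INS below copies Op2's low lane into lane 1 of the widened Op1.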
4691 unsigned InsertOpc, InsSubRegIdx;
4692 std::tie(InsertOpc, InsSubRegIdx) =
4693 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4694
4695 if (!Dst)
4696 Dst = MRI.createVirtualRegister(DstRC);
4697 auto InsElt =
4698 MIRBuilder
4699 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4700 .addImm(1) /* Lane index */
4701 .addUse(WidenedOp2->getOperand(0).getReg())
4702 .addImm(0);
4703 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4704 return &*InsElt;
4705 }
4706
4707 MachineInstr *
4708 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4709 Register Src2, AArch64CC::CondCode Pred,
4710 MachineIRBuilder &MIRBuilder) const {
4711 auto &MRI = *MIRBuilder.getMRI();
4712 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4713 // If we used a register class, then this won't necessarily have an LLT.
4714 // Compute the size based off whether or not we have a class or bank.
4715 unsigned Size;
4716 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4717 Size = TRI.getRegSizeInBits(*RC);
4718 else
4719 Size = MRI.getType(Dst).getSizeInBits();
4720 // Some opcodes use s1.
4721 assert(Size <= 64 && "Expected 64 bits or less only!");
4722 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4723 unsigned Opc = OpcTable[Size == 64];
4724 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4725 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4726 return &*CSINC;
4727 }
4728
4729 std::pair<MachineInstr *, AArch64CC::CondCode>
4730 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4731 MachineOperand &LHS,
4732 MachineOperand &RHS,
4733 MachineIRBuilder &MIRBuilder) const {
4734 switch (Opcode) {
4735 default:
4736 llvm_unreachable("Unexpected opcode!");
4737 case TargetOpcode::G_SADDO:
4738 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4739 case TargetOpcode::G_UADDO:
4740 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4741 case TargetOpcode::G_SSUBO:
4742 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4743 case TargetOpcode::G_USUBO:
4744 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4745 }
4746 }
4747
4748 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4749 /// expressed as a conjunction.
4750 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
4751 /// changing the conditions on the CMP tests.
4752 /// (this means we can call emitConjunctionRec() with
4753 /// Negate==true on this sub-tree)
4754 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
4755 /// cannot do the negation naturally. We are required to
4756 /// emit the subtree first in this case.
4757 /// \param WillNegate Is true if we are called when the result of this
4758 /// subexpression must be negated. This happens when the
4759 /// outer expression is an OR. We can use this fact to know
4760 /// that we have a double negation (or (or ...) ...) that
4761 /// can be implemented for free.
4762 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4763 bool WillNegate, MachineRegisterInfo &MRI,
4764 unsigned Depth = 0) {
4765 if (!MRI.hasOneNonDBGUse(Val))
4766 return false;
4767 MachineInstr *ValDef = MRI.getVRegDef(Val);
4768 unsigned Opcode = ValDef->getOpcode();
4769 if (isa<GAnyCmp>(ValDef)) {
4770 CanNegate = true;
4771 MustBeFirst = false;
4772 return true;
4773 }
4774 // Protect against exponential runtime and stack overflow.
4775 if (Depth > 6)
4776 return false;
4777 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4778 bool IsOR = Opcode == TargetOpcode::G_OR;
4779 Register O0 = ValDef->getOperand(1).getReg();
4780 Register O1 = ValDef->getOperand(2).getReg();
4781 bool CanNegateL;
4782 bool MustBeFirstL;
4783 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4784 return false;
4785 bool CanNegateR;
4786 bool MustBeFirstR;
4787 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4788 return false;
4789
4790 if (MustBeFirstL && MustBeFirstR)
4791 return false;
4792
4793 if (IsOR) {
4794 // For an OR expression we need to be able to naturally negate at least
4795 // one side or we cannot do the transformation at all.
4796 if (!CanNegateL && !CanNegateR)
4797 return false;
4798 // If the result of the OR will be negated and we can naturally negate
4799 // the leaves, then this sub-tree as a whole negates naturally.
4800 CanNegate = WillNegate && CanNegateL && CanNegateR;
4801 // If we cannot naturally negate the whole sub-tree, then this must be
4802 // emitted first.
4803 MustBeFirst = !CanNegate;
4804 } else {
4805 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4806 // We cannot naturally negate an AND operation.
4807 CanNegate = false;
4808 MustBeFirst = MustBeFirstL || MustBeFirstR;
4809 }
4810 return true;
4811 }
4812 return false;
4813 }
4814
4815 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4816 Register LHS, Register RHS, CmpInst::Predicate CC,
4817 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4818 MachineIRBuilder &MIB) const {
4819 // TODO: emit CMN as an optimization.
4820 auto &MRI = *MIB.getMRI();
4821 LLT OpTy = MRI.getType(LHS);
4822 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4823 unsigned CCmpOpc;
4824 if (CmpInst::isIntPredicate(CC)) {
4825 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4826 } else {
4827 switch (OpTy.getSizeInBits()) {
4828 case 16:
4829 CCmpOpc = AArch64::FCCMPHrr;
4830 break;
4831 case 32:
4832 CCmpOpc = AArch64::FCCMPSrr;
4833 break;
4834 case 64:
4835 CCmpOpc = AArch64::FCCMPDrr;
4836 break;
4837 default:
4838 return nullptr;
4839 }
4840 }
4841 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4842 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4843 auto CCmp =
4844 MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate);
4845 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4846 return &*CCmp;
4847 }
4848
4849 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4850 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4851 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4852 // We're at a tree leaf, produce a conditional comparison operation.
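// The first leaf emitted in a chain becomes an ordinary flag-setting compare
// (SUBS or FCMP); every later leaf becomes a conditional compare (CCMP/FCCMP)
// predicated on the condition computed so far. This is only a sketch of the
// general shape; the exact opcodes depend on the operand types handled below.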
4853 auto &MRI = *MIB.getMRI(); 4854 MachineInstr *ValDef = MRI.getVRegDef(Val); 4855 unsigned Opcode = ValDef->getOpcode(); 4856 if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { 4857 Register LHS = Cmp->getLHSReg(); 4858 Register RHS = Cmp->getRHSReg(); 4859 CmpInst::Predicate CC = Cmp->getCond(); 4860 if (Negate) 4861 CC = CmpInst::getInversePredicate(CC); 4862 if (isa<GICmp>(Cmp)) { 4863 OutCC = changeICMPPredToAArch64CC(CC); 4864 } else { 4865 // Handle special FP cases. 4866 AArch64CC::CondCode ExtraCC; 4867 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 4868 // Some floating point conditions can't be tested with a single condition 4869 // code. Construct an additional comparison in this case. 4870 if (ExtraCC != AArch64CC::AL) { 4871 MachineInstr *ExtraCmp; 4872 if (!CCOp) 4873 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); 4874 else 4875 ExtraCmp = 4876 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); 4877 CCOp = ExtraCmp->getOperand(0).getReg(); 4878 Predicate = ExtraCC; 4879 } 4880 } 4881 4882 // Produce a normal comparison if we are first in the chain 4883 if (!CCOp) { 4884 auto Dst = MRI.cloneVirtualRegister(LHS); 4885 if (isa<GICmp>(Cmp)) 4886 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); 4887 return emitFPCompare(Cmp->getOperand(2).getReg(), 4888 Cmp->getOperand(3).getReg(), MIB); 4889 } 4890 // Otherwise produce a ccmp. 4891 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); 4892 } 4893 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); 4894 4895 bool IsOR = Opcode == TargetOpcode::G_OR; 4896 4897 Register LHS = ValDef->getOperand(1).getReg(); 4898 bool CanNegateL; 4899 bool MustBeFirstL; 4900 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); 4901 assert(ValidL && "Valid conjunction/disjunction tree"); 4902 (void)ValidL; 4903 4904 Register RHS = ValDef->getOperand(2).getReg(); 4905 bool CanNegateR; 4906 bool MustBeFirstR; 4907 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); 4908 assert(ValidR && "Valid conjunction/disjunction tree"); 4909 (void)ValidR; 4910 4911 // Swap sub-tree that must come first to the right side. 4912 if (MustBeFirstL) { 4913 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 4914 std::swap(LHS, RHS); 4915 std::swap(CanNegateL, CanNegateR); 4916 std::swap(MustBeFirstL, MustBeFirstR); 4917 } 4918 4919 bool NegateR; 4920 bool NegateAfterR; 4921 bool NegateL; 4922 bool NegateAfterAll; 4923 if (Opcode == TargetOpcode::G_OR) { 4924 // Swap the sub-tree that we can negate naturally to the left. 4925 if (!CanNegateL) { 4926 assert(CanNegateR && "at least one side must be negatable"); 4927 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 4928 assert(!Negate); 4929 std::swap(LHS, RHS); 4930 NegateR = false; 4931 NegateAfterR = true; 4932 } else { 4933 // Negate the left sub-tree if possible, otherwise negate the result. 4934 NegateR = CanNegateR; 4935 NegateAfterR = !CanNegateR; 4936 } 4937 NegateL = true; 4938 NegateAfterAll = !Negate; 4939 } else { 4940 assert(Opcode == TargetOpcode::G_AND && 4941 "Valid conjunction/disjunction tree"); 4942 assert(!Negate && "Valid conjunction/disjunction tree"); 4943 4944 NegateL = false; 4945 NegateR = false; 4946 NegateAfterR = false; 4947 NegateAfterAll = false; 4948 } 4949 4950 // Emit sub-trees. 
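// The right sub-tree is emitted first, and the left sub-tree is then chained
// onto its flags. As a rough sketch, for two integer compares OR'd together
// this produces something like:
//   SUBS ...                    ; RHS compare, sets NZCV
//   CCMP ..., #nzcv, <RHS cond> ; LHS compare, predicated on the RHS result
// with OutCC describing the condition of the whole conjunction afterwards.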
4951 AArch64CC::CondCode RHSCC; 4952 MachineInstr *CmpR = 4953 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); 4954 if (NegateAfterR) 4955 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 4956 MachineInstr *CmpL = emitConjunctionRec( 4957 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); 4958 if (NegateAfterAll) 4959 OutCC = AArch64CC::getInvertedCondCode(OutCC); 4960 return CmpL; 4961 } 4962 4963 MachineInstr *AArch64InstructionSelector::emitConjunction( 4964 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { 4965 bool DummyCanNegate; 4966 bool DummyMustBeFirst; 4967 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, 4968 *MIB.getMRI())) 4969 return nullptr; 4970 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); 4971 } 4972 4973 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, 4974 MachineInstr &CondMI) { 4975 AArch64CC::CondCode AArch64CC; 4976 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); 4977 if (!ConjMI) 4978 return false; 4979 4980 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); 4981 SelI.eraseFromParent(); 4982 return true; 4983 } 4984 4985 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { 4986 MachineRegisterInfo &MRI = *MIB.getMRI(); 4987 // We want to recognize this pattern: 4988 // 4989 // $z = G_FCMP pred, $x, $y 4990 // ... 4991 // $w = G_SELECT $z, $a, $b 4992 // 4993 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 4994 // some copies/truncs in between.) 4995 // 4996 // If we see this, then we can emit something like this: 4997 // 4998 // fcmp $x, $y 4999 // fcsel $w, $a, $b, pred 5000 // 5001 // Rather than emitting both of the rather long sequences in the standard 5002 // G_FCMP/G_SELECT select methods. 5003 5004 // First, check if the condition is defined by a compare. 5005 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 5006 5007 // We can only fold if all of the defs have one use. 5008 Register CondDefReg = CondDef->getOperand(0).getReg(); 5009 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 5010 // Unless it's another select. 5011 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 5012 if (CondDef == &UI) 5013 continue; 5014 if (UI.getOpcode() != TargetOpcode::G_SELECT) 5015 return false; 5016 } 5017 } 5018 5019 // Is the condition defined by a compare? 5020 unsigned CondOpc = CondDef->getOpcode(); 5021 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { 5022 if (tryOptSelectConjunction(I, *CondDef)) 5023 return true; 5024 return false; 5025 } 5026 5027 AArch64CC::CondCode CondCode; 5028 if (CondOpc == TargetOpcode::G_ICMP) { 5029 auto Pred = 5030 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5031 CondCode = changeICMPPredToAArch64CC(Pred); 5032 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 5033 CondDef->getOperand(1), MIB); 5034 } else { 5035 // Get the condition code for the select. 5036 auto Pred = 5037 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5038 AArch64CC::CondCode CondCode2; 5039 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 5040 5041 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 5042 // instructions to emit the comparison. 5043 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 5044 // unnecessary. 
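// (For example, FCMP_UEQ is expected to map to EQ plus VS, which cannot be
// expressed as a single FCSEL condition, hence the bail-out below.)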
5045 if (CondCode2 != AArch64CC::AL) 5046 return false; 5047 5048 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 5049 CondDef->getOperand(3).getReg(), MIB)) { 5050 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 5051 return false; 5052 } 5053 } 5054 5055 // Emit the select. 5056 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 5057 I.getOperand(3).getReg(), CondCode, MIB); 5058 I.eraseFromParent(); 5059 return true; 5060 } 5061 5062 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 5063 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 5064 MachineIRBuilder &MIRBuilder) const { 5065 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 5066 "Unexpected MachineOperand"); 5067 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5068 // We want to find this sort of thing: 5069 // x = G_SUB 0, y 5070 // G_ICMP z, x 5071 // 5072 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 5073 // e.g: 5074 // 5075 // cmn z, y 5076 5077 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 5078 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 5079 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 5080 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 5081 // Given this: 5082 // 5083 // x = G_SUB 0, y 5084 // G_ICMP x, z 5085 // 5086 // Produce this: 5087 // 5088 // cmn y, z 5089 if (isCMN(LHSDef, P, MRI)) 5090 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 5091 5092 // Same idea here, but with the RHS of the compare instead: 5093 // 5094 // Given this: 5095 // 5096 // x = G_SUB 0, y 5097 // G_ICMP z, x 5098 // 5099 // Produce this: 5100 // 5101 // cmn z, y 5102 if (isCMN(RHSDef, P, MRI)) 5103 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 5104 5105 // Given this: 5106 // 5107 // z = G_AND x, y 5108 // G_ICMP z, 0 5109 // 5110 // Produce this if the compare is signed: 5111 // 5112 // tst x, y 5113 if (!CmpInst::isUnsigned(P) && LHSDef && 5114 LHSDef->getOpcode() == TargetOpcode::G_AND) { 5115 // Make sure that the RHS is 0. 5116 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5117 if (!ValAndVReg || ValAndVReg->Value != 0) 5118 return nullptr; 5119 5120 return emitTST(LHSDef->getOperand(1), 5121 LHSDef->getOperand(2), MIRBuilder); 5122 } 5123 5124 return nullptr; 5125 } 5126 5127 bool AArch64InstructionSelector::selectShuffleVector( 5128 MachineInstr &I, MachineRegisterInfo &MRI) { 5129 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5130 Register Src1Reg = I.getOperand(1).getReg(); 5131 const LLT Src1Ty = MRI.getType(Src1Reg); 5132 Register Src2Reg = I.getOperand(2).getReg(); 5133 const LLT Src2Ty = MRI.getType(Src2Reg); 5134 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 5135 5136 MachineBasicBlock &MBB = *I.getParent(); 5137 MachineFunction &MF = *MBB.getParent(); 5138 LLVMContext &Ctx = MF.getFunction().getContext(); 5139 5140 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 5141 // it's originated from a <1 x T> type. Those should have been lowered into 5142 // G_BUILD_VECTOR earlier. 5143 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 5144 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 5145 return false; 5146 } 5147 5148 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 5149 5150 SmallVector<Constant *, 64> CstIdxs; 5151 for (int Val : Mask) { 5152 // For now, any undef indexes we'll just assume to be 0. 
This should be 5153 // optimized in future, e.g. to select DUP etc. 5154 Val = Val < 0 ? 0 : Val; 5155 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5156 unsigned Offset = Byte + Val * BytesPerElt; 5157 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 5158 } 5159 } 5160 5161 // Use a constant pool to load the index vector for TBL. 5162 Constant *CPVal = ConstantVector::get(CstIdxs); 5163 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 5164 if (!IndexLoad) { 5165 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 5166 return false; 5167 } 5168 5169 if (DstTy.getSizeInBits() != 128) { 5170 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 5171 // This case can be done with TBL1. 5172 MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB); 5173 if (!Concat) { 5174 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 5175 return false; 5176 } 5177 5178 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 5179 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 5180 IndexLoad->getOperand(0).getReg(), MIB); 5181 5182 auto TBL1 = MIB.buildInstr( 5183 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 5184 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 5185 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 5186 5187 auto Copy = 5188 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 5189 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 5190 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 5191 I.eraseFromParent(); 5192 return true; 5193 } 5194 5195 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 5196 // Q registers for regalloc. 5197 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 5198 auto RegSeq = createQTuple(Regs, MIB); 5199 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 5200 {RegSeq, IndexLoad->getOperand(0)}); 5201 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 5202 I.eraseFromParent(); 5203 return true; 5204 } 5205 5206 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 5207 Optional<Register> DstReg, Register SrcReg, Register EltReg, 5208 unsigned LaneIdx, const RegisterBank &RB, 5209 MachineIRBuilder &MIRBuilder) const { 5210 MachineInstr *InsElt = nullptr; 5211 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5212 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5213 5214 // Create a register to define with the insert if one wasn't passed in. 
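// Rough shape of what gets emitted (a sketch, not literal output):
//   FPR element: widen the element with scalar_to_vector, then
//                INSvi<size>lane %dst, %src, lane, %widened, 0
//   GPR element: INSvi<size>gpr %dst, %src, lane, %elt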
5215 if (!DstReg) 5216 DstReg = MRI.createVirtualRegister(DstRC); 5217 5218 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 5219 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 5220 5221 if (RB.getID() == AArch64::FPRRegBankID) { 5222 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 5223 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5224 .addImm(LaneIdx) 5225 .addUse(InsSub->getOperand(0).getReg()) 5226 .addImm(0); 5227 } else { 5228 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5229 .addImm(LaneIdx) 5230 .addUse(EltReg); 5231 } 5232 5233 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 5234 return InsElt; 5235 } 5236 5237 bool AArch64InstructionSelector::selectUSMovFromExtend( 5238 MachineInstr &MI, MachineRegisterInfo &MRI) { 5239 if (MI.getOpcode() != TargetOpcode::G_SEXT && 5240 MI.getOpcode() != TargetOpcode::G_ZEXT && 5241 MI.getOpcode() != TargetOpcode::G_ANYEXT) 5242 return false; 5243 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 5244 const Register DefReg = MI.getOperand(0).getReg(); 5245 const LLT DstTy = MRI.getType(DefReg); 5246 unsigned DstSize = DstTy.getSizeInBits(); 5247 5248 if (DstSize != 32 && DstSize != 64) 5249 return false; 5250 5251 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 5252 MI.getOperand(1).getReg(), MRI); 5253 int64_t Lane; 5254 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 5255 return false; 5256 Register Src0 = Extract->getOperand(1).getReg(); 5257 5258 const LLT &VecTy = MRI.getType(Src0); 5259 5260 if (VecTy.getSizeInBits() != 128) { 5261 const MachineInstr *ScalarToVector = emitScalarToVector( 5262 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 5263 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 5264 Src0 = ScalarToVector->getOperand(0).getReg(); 5265 } 5266 5267 unsigned Opcode; 5268 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 5269 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 5270 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 5271 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 5272 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 5273 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 5274 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 5275 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 5276 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 5277 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 5278 else 5279 llvm_unreachable("Unexpected type combo for S/UMov!"); 5280 5281 // We may need to generate one of these, depending on the type and sign of the 5282 // input: 5283 // DstReg = SMOV Src0, Lane; 5284 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 5285 MachineInstr *ExtI = nullptr; 5286 if (DstSize == 64 && !IsSigned) { 5287 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 5288 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 5289 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 5290 .addImm(0) 5291 .addUse(NewReg) 5292 .addImm(AArch64::sub_32); 5293 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 5294 } else 5295 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 5296 5297 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 5298 MI.eraseFromParent(); 5299 return true; 5300 } 5301 5302 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, 5303 MachineRegisterInfo &MRI) { 5304 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); 5305 5306 // Get information on the destination. 5307 Register DstReg = I.getOperand(0).getReg(); 5308 const LLT DstTy = MRI.getType(DstReg); 5309 unsigned VecSize = DstTy.getSizeInBits(); 5310 5311 // Get information on the element we want to insert into the destination. 5312 Register EltReg = I.getOperand(2).getReg(); 5313 const LLT EltTy = MRI.getType(EltReg); 5314 unsigned EltSize = EltTy.getSizeInBits(); 5315 if (EltSize < 16 || EltSize > 64) 5316 return false; // Don't support all element types yet. 5317 5318 // Find the definition of the index. Bail out if it's not defined by a 5319 // G_CONSTANT. 5320 Register IdxReg = I.getOperand(3).getReg(); 5321 auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI); 5322 if (!VRegAndVal) 5323 return false; 5324 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 5325 5326 // Perform the lane insert. 5327 Register SrcReg = I.getOperand(1).getReg(); 5328 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5329 5330 if (VecSize < 128) { 5331 // If the vector we're inserting into is smaller than 128 bits, widen it 5332 // to 128 to do the insert. 5333 MachineInstr *ScalarToVec = 5334 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB); 5335 if (!ScalarToVec) 5336 return false; 5337 SrcReg = ScalarToVec->getOperand(0).getReg(); 5338 } 5339 5340 // Create an insert into a new FPR128 register. 5341 // Note that if our vector is already 128 bits, we end up emitting an extra 5342 // register. 5343 MachineInstr *InsMI = 5344 emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB); 5345 5346 if (VecSize < 128) { 5347 // If we had to widen to perform the insert, then we have to demote back to 5348 // the original size to get the result we want. 5349 Register DemoteVec = InsMI->getOperand(0).getReg(); 5350 const TargetRegisterClass *RC = 5351 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI)); 5352 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5353 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5354 return false; 5355 } 5356 unsigned SubReg = 0; 5357 if (!getSubRegForClass(RC, TRI, SubReg)) 5358 return false; 5359 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5360 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << VecSize 5361 << "\n"); 5362 return false; 5363 } 5364 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 5365 .addReg(DemoteVec, 0, SubReg); 5366 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5367 } else { 5368 // No widening needed. 5369 InsMI->getOperand(0).setReg(DstReg); 5370 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 5371 } 5372 5373 I.eraseFromParent(); 5374 return true; 5375 } 5376 5377 MachineInstr * 5378 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5379 MachineIRBuilder &MIRBuilder, 5380 MachineRegisterInfo &MRI) { 5381 LLT DstTy = MRI.getType(Dst); 5382 unsigned DstSize = DstTy.getSizeInBits(); 5383 if (CV->isNullValue()) { 5384 if (DstSize == 128) { 5385 auto Mov = 5386 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5387 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5388 return &*Mov; 5389 } 5390 5391 if (DstSize == 64) { 5392 auto Mov = 5393 MIRBuilder 5394 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5395 .addImm(0); 5396 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5397 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5398 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5399 return &*Copy; 5400 } 5401 } 5402 5403 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5404 if (!CPLoad) { 5405 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5406 return nullptr; 5407 } 5408 5409 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5410 RBI.constrainGenericRegister( 5411 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5412 return &*Copy; 5413 } 5414 5415 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5416 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5417 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5418 unsigned DstSize = DstTy.getSizeInBits(); 5419 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5420 if (DstSize < 32) 5421 return false; 5422 // Check if we're building a constant vector, in which case we want to 5423 // generate a constant pool load instead of a vector insert sequence. 5424 SmallVector<Constant *, 16> Csts; 5425 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5426 // Try to find G_CONSTANT or G_FCONSTANT 5427 auto *OpMI = 5428 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5429 if (OpMI) 5430 Csts.emplace_back( 5431 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5432 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5433 I.getOperand(Idx).getReg(), MRI))) 5434 Csts.emplace_back( 5435 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5436 else 5437 return false; 5438 } 5439 Constant *CV = ConstantVector::get(Csts); 5440 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5441 return false; 5442 I.eraseFromParent(); 5443 return true; 5444 } 5445 5446 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5447 MachineInstr &I, MachineRegisterInfo &MRI) { 5448 // Given: 5449 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5450 // 5451 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5452 Register Dst = I.getOperand(0).getReg(); 5453 Register EltReg = I.getOperand(1).getReg(); 5454 LLT EltTy = MRI.getType(EltReg); 5455 // If the index isn't on the same bank as its elements, then this can't be a 5456 // SUBREG_TO_REG. 
5457 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5458 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5459 if (EltRB != DstRB) 5460 return false; 5461 if (any_of(make_range(I.operands_begin() + 2, I.operands_end()), 5462 [&MRI](const MachineOperand &Op) { 5463 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), 5464 MRI); 5465 })) 5466 return false; 5467 unsigned SubReg; 5468 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); 5469 if (!EltRC) 5470 return false; 5471 const TargetRegisterClass *DstRC = 5472 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); 5473 if (!DstRC) 5474 return false; 5475 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5476 return false; 5477 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5478 .addImm(0) 5479 .addUse(EltReg) 5480 .addImm(SubReg); 5481 I.eraseFromParent(); 5482 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5483 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5484 } 5485 5486 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5487 MachineRegisterInfo &MRI) { 5488 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5489 // Until we port more of the optimized selections, for now just use a vector 5490 // insert sequence. 5491 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5492 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5493 unsigned EltSize = EltTy.getSizeInBits(); 5494 5495 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5496 return true; 5497 if (tryOptBuildVecToSubregToReg(I, MRI)) 5498 return true; 5499 5500 if (EltSize < 16 || EltSize > 64) 5501 return false; // Don't support all element types yet. 5502 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5503 5504 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5505 MachineInstr *ScalarToVec = 5506 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5507 I.getOperand(1).getReg(), MIB); 5508 if (!ScalarToVec) 5509 return false; 5510 5511 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5512 unsigned DstSize = DstTy.getSizeInBits(); 5513 5514 // Keep track of the last MI we inserted. Later on, we might be able to save 5515 // a copy using it. 5516 MachineInstr *PrevMI = nullptr; 5517 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5518 // Note that if we don't do a subregister copy, we can end up making an 5519 // extra register. 5520 PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB, 5521 MIB); 5522 DstVec = PrevMI->getOperand(0).getReg(); 5523 } 5524 5525 // If DstTy's size in bits is less than 128, then emit a subregister copy 5526 // from DstVec to the last register we've defined. 5527 if (DstSize < 128) { 5528 // Force this to be FPR using the destination vector. 5529 const TargetRegisterClass *RC = 5530 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5531 if (!RC) 5532 return false; 5533 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5534 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5535 return false; 5536 } 5537 5538 unsigned SubReg = 0; 5539 if (!getSubRegForClass(RC, TRI, SubReg)) 5540 return false; 5541 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5542 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << DstSize 5543 << "\n"); 5544 return false; 5545 } 5546 5547 Register Reg = MRI.createVirtualRegister(RC); 5548 Register DstReg = I.getOperand(0).getReg(); 5549 5550 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5551 MachineOperand &RegOp = I.getOperand(1); 5552 RegOp.setReg(Reg); 5553 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5554 } else { 5555 // We don't need a subregister copy. Save a copy by re-using the 5556 // destination register on the final insert. 5557 assert(PrevMI && "PrevMI was null?"); 5558 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5559 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5560 } 5561 5562 I.eraseFromParent(); 5563 return true; 5564 } 5565 5566 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 5567 unsigned NumVecs, 5568 MachineInstr &I) { 5569 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5570 assert(Opc && "Expected an opcode?"); 5571 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5572 auto &MRI = *MIB.getMRI(); 5573 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5574 unsigned Size = Ty.getSizeInBits(); 5575 assert((Size == 64 || Size == 128) && 5576 "Destination must be 64 bits or 128 bits?"); 5577 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 5578 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 5579 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 5580 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 5581 Load.cloneMemRefs(I); 5582 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5583 Register SelectedLoadDst = Load->getOperand(0).getReg(); 5584 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 5585 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 5586 .addReg(SelectedLoadDst, 0, SubReg + Idx); 5587 // Emit the subreg copies and immediately select them. 5588 // FIXME: We should refactor our copy code into an emitCopy helper and 5589 // clean up uses of this pattern elsewhere in the selector. 5590 selectCopy(*Vec, TII, MRI, TRI, RBI); 5591 } 5592 return true; 5593 } 5594 5595 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 5596 MachineInstr &I, MachineRegisterInfo &MRI) { 5597 // Find the intrinsic ID. 5598 unsigned IntrinID = I.getIntrinsicID(); 5599 5600 const LLT S8 = LLT::scalar(8); 5601 const LLT S16 = LLT::scalar(16); 5602 const LLT S32 = LLT::scalar(32); 5603 const LLT S64 = LLT::scalar(64); 5604 const LLT P0 = LLT::pointer(0, 64); 5605 // Select the instruction. 5606 switch (IntrinID) { 5607 default: 5608 return false; 5609 case Intrinsic::aarch64_ldxp: 5610 case Intrinsic::aarch64_ldaxp: { 5611 auto NewI = MIB.buildInstr( 5612 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 5613 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 5614 {I.getOperand(3)}); 5615 NewI.cloneMemRefs(I); 5616 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 5617 break; 5618 } 5619 case Intrinsic::trap: 5620 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); 5621 break; 5622 case Intrinsic::debugtrap: 5623 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); 5624 break; 5625 case Intrinsic::ubsantrap: 5626 MIB.buildInstr(AArch64::BRK, {}, {}) 5627 .addImm(I.getOperand(1).getImm() | ('U' << 8)); 5628 break; 5629 case Intrinsic::aarch64_neon_ld2: { 5630 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5631 unsigned Opc = 0; 5632 if (Ty == LLT::fixed_vector(8, S8)) 5633 Opc = AArch64::LD2Twov8b; 5634 else if (Ty == LLT::fixed_vector(16, S8)) 5635 Opc = AArch64::LD2Twov16b; 5636 else if (Ty == LLT::fixed_vector(4, S16)) 5637 Opc = AArch64::LD2Twov4h; 5638 else if (Ty == LLT::fixed_vector(8, S16)) 5639 Opc = AArch64::LD2Twov8h; 5640 else if (Ty == LLT::fixed_vector(2, S32)) 5641 Opc = AArch64::LD2Twov2s; 5642 else if (Ty == LLT::fixed_vector(4, S32)) 5643 Opc = AArch64::LD2Twov4s; 5644 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5645 Opc = AArch64::LD2Twov2d; 5646 else if (Ty == S64 || Ty == P0) 5647 Opc = AArch64::LD1Twov1d; 5648 else 5649 llvm_unreachable("Unexpected type for ld2!"); 5650 selectVectorLoadIntrinsic(Opc, 2, I); 5651 break; 5652 } 5653 case Intrinsic::aarch64_neon_ld4: { 5654 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5655 unsigned Opc = 0; 5656 if (Ty == LLT::fixed_vector(8, S8)) 5657 Opc = AArch64::LD4Fourv8b; 5658 else if (Ty == LLT::fixed_vector(16, S8)) 5659 Opc = AArch64::LD4Fourv16b; 5660 else if (Ty == LLT::fixed_vector(4, S16)) 5661 Opc = AArch64::LD4Fourv4h; 5662 else if (Ty == LLT::fixed_vector(8, S16)) 5663 Opc = AArch64::LD4Fourv8h; 5664 else if (Ty == LLT::fixed_vector(2, S32)) 5665 Opc = AArch64::LD4Fourv2s; 5666 else if (Ty == LLT::fixed_vector(4, S32)) 5667 Opc = AArch64::LD4Fourv4s; 5668 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5669 Opc = AArch64::LD4Fourv2d; 5670 else if (Ty == S64 || Ty == P0) 5671 Opc = AArch64::LD1Fourv1d; 5672 else 5673 llvm_unreachable("Unexpected type for ld4!"); 5674 selectVectorLoadIntrinsic(Opc, 4, I); 5675 break; 5676 } 5677 case Intrinsic::aarch64_neon_st2: { 5678 Register Src1 = I.getOperand(1).getReg(); 5679 Register Src2 = I.getOperand(2).getReg(); 5680 Register Ptr = I.getOperand(3).getReg(); 5681 LLT Ty = MRI.getType(Src1); 5682 unsigned Opc; 5683 if (Ty == LLT::fixed_vector(8, S8)) 5684 Opc = AArch64::ST2Twov8b; 5685 else if (Ty == LLT::fixed_vector(16, S8)) 5686 Opc = AArch64::ST2Twov16b; 5687 else if (Ty == LLT::fixed_vector(4, S16)) 5688 Opc = AArch64::ST2Twov4h; 5689 else if (Ty == LLT::fixed_vector(8, S16)) 5690 Opc = AArch64::ST2Twov8h; 5691 else if (Ty == LLT::fixed_vector(2, S32)) 5692 Opc = AArch64::ST2Twov2s; 5693 else if (Ty == LLT::fixed_vector(4, S32)) 5694 Opc = AArch64::ST2Twov4s; 5695 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5696 Opc = AArch64::ST2Twov2d; 5697 else if (Ty == S64 || Ty == P0) 5698 Opc = AArch64::ST1Twov1d; 5699 else 5700 llvm_unreachable("Unexpected type for st2!"); 5701 SmallVector<Register, 2> Regs = {Src1, Src2}; 5702 Register Tuple = Ty.getSizeInBits() == 128 ? 
createQTuple(Regs, MIB)
5703 : createDTuple(Regs, MIB);
5704 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5705 Store.cloneMemRefs(I);
5706 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5707 break;
5708 }
5709 case Intrinsic::aarch64_mops_memset_tag: {
5710 // Transform
5711 // %dst:gpr(p0) = \
5712 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
5713 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
5714 // where %dst is updated, into
5715 // %Rd:GPR64common, %Rn:GPR64 = \
5716 // MOPSMemorySetTaggingPseudo \
5717 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
5718 // where Rd and Rn are tied.
5719 // It is expected that %val has been extended to s64 in legalization.
5720 // Note that the order of the size/value operands is swapped.
5721
5722 Register DstDef = I.getOperand(0).getReg();
5723 // I.getOperand(1) is the intrinsic function
5724 Register DstUse = I.getOperand(2).getReg();
5725 Register ValUse = I.getOperand(3).getReg();
5726 Register SizeUse = I.getOperand(4).getReg();
5727
5728 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
5729 // Therefore an additional virtual register is required for the updated size
5730 // operand. This value is not accessible via the semantics of the intrinsic.
5731 Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
5732
5733 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
5734 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
5735 Memset.cloneMemRefs(I);
5736 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
5737 break;
5738 }
5739 }
5740
5741 I.eraseFromParent();
5742 return true;
5743 }
5744
5745 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5746 MachineRegisterInfo &MRI) {
5747 unsigned IntrinID = I.getIntrinsicID();
5748
5749 switch (IntrinID) {
5750 default:
5751 break;
5752 case Intrinsic::aarch64_crypto_sha1h: {
5753 Register DstReg = I.getOperand(0).getReg();
5754 Register SrcReg = I.getOperand(2).getReg();
5755
5756 // FIXME: Should this be an assert?
5757 if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5758 MRI.getType(SrcReg).getSizeInBits() != 32)
5759 return false;
5760
5761 // The operation has to happen on FPRs. Set up some new FPR registers for
5762 // the source and destination if they are on GPRs.
5763 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5764 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5765 MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5766
5767 // Make sure the copy ends up getting constrained properly.
5768 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5769 AArch64::GPR32RegClass, MRI);
5770 }
5771
5772 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5773 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5774
5775 // Actually insert the instruction.
5776 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5777 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5778
5779 // Did we create a new register for the destination?
5780 if (DstReg != I.getOperand(0).getReg()) {
5781 // Yep. Copy the result of the instruction back into the original
5782 // destination.
5783 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 5784 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 5785 AArch64::GPR32RegClass, MRI); 5786 } 5787 5788 I.eraseFromParent(); 5789 return true; 5790 } 5791 case Intrinsic::ptrauth_sign: { 5792 Register DstReg = I.getOperand(0).getReg(); 5793 Register ValReg = I.getOperand(2).getReg(); 5794 uint64_t Key = I.getOperand(3).getImm(); 5795 Register DiscReg = I.getOperand(4).getReg(); 5796 auto DiscVal = getIConstantVRegVal(DiscReg, MRI); 5797 bool IsDiscZero = DiscVal && DiscVal->isNullValue(); 5798 5799 if (Key > 3) 5800 return false; 5801 5802 unsigned Opcodes[][4] = { 5803 {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB}, 5804 {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}}; 5805 unsigned Opcode = Opcodes[IsDiscZero][Key]; 5806 5807 auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg}); 5808 5809 if (!IsDiscZero) { 5810 PAC.addUse(DiscReg); 5811 RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI); 5812 } 5813 5814 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5815 I.eraseFromParent(); 5816 return true; 5817 } 5818 case Intrinsic::frameaddress: 5819 case Intrinsic::returnaddress: { 5820 MachineFunction &MF = *I.getParent()->getParent(); 5821 MachineFrameInfo &MFI = MF.getFrameInfo(); 5822 5823 unsigned Depth = I.getOperand(2).getImm(); 5824 Register DstReg = I.getOperand(0).getReg(); 5825 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5826 5827 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 5828 if (!MFReturnAddr) { 5829 // Insert the copy from LR/X30 into the entry block, before it can be 5830 // clobbered by anything. 5831 MFI.setReturnAddressIsTaken(true); 5832 MFReturnAddr = getFunctionLiveInPhysReg( 5833 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); 5834 } 5835 5836 if (STI.hasPAuth()) { 5837 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 5838 } else { 5839 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 5840 MIB.buildInstr(AArch64::XPACLRI); 5841 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5842 } 5843 5844 I.eraseFromParent(); 5845 return true; 5846 } 5847 5848 MFI.setFrameAddressIsTaken(true); 5849 Register FrameAddr(AArch64::FP); 5850 while (Depth--) { 5851 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 5852 auto Ldr = 5853 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 5854 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 5855 FrameAddr = NextFrame; 5856 } 5857 5858 if (IntrinID == Intrinsic::frameaddress) 5859 MIB.buildCopy({DstReg}, {FrameAddr}); 5860 else { 5861 MFI.setReturnAddressIsTaken(true); 5862 5863 if (STI.hasPAuth()) { 5864 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 5865 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 5866 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 5867 } else { 5868 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 5869 .addImm(1); 5870 MIB.buildInstr(AArch64::XPACLRI); 5871 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5872 } 5873 } 5874 5875 I.eraseFromParent(); 5876 return true; 5877 } 5878 case Intrinsic::swift_async_context_addr: 5879 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 5880 {Register(AArch64::FP)}) 5881 .addImm(8) 5882 .addImm(0); 5883 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 5884 5885 MF->getFrameInfo().setFrameAddressIsTaken(true); 5886 
MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 5887 I.eraseFromParent(); 5888 return true; 5889 } 5890 return false; 5891 } 5892 5893 InstructionSelector::ComplexRendererFns 5894 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 5895 auto MaybeImmed = getImmedFromMO(Root); 5896 if (MaybeImmed == None || *MaybeImmed > 31) 5897 return None; 5898 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 5899 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5900 } 5901 5902 InstructionSelector::ComplexRendererFns 5903 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 5904 auto MaybeImmed = getImmedFromMO(Root); 5905 if (MaybeImmed == None || *MaybeImmed > 31) 5906 return None; 5907 uint64_t Enc = 31 - *MaybeImmed; 5908 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5909 } 5910 5911 InstructionSelector::ComplexRendererFns 5912 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 5913 auto MaybeImmed = getImmedFromMO(Root); 5914 if (MaybeImmed == None || *MaybeImmed > 63) 5915 return None; 5916 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 5917 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5918 } 5919 5920 InstructionSelector::ComplexRendererFns 5921 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 5922 auto MaybeImmed = getImmedFromMO(Root); 5923 if (MaybeImmed == None || *MaybeImmed > 63) 5924 return None; 5925 uint64_t Enc = 63 - *MaybeImmed; 5926 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5927 } 5928 5929 /// Helper to select an immediate value that can be represented as a 12-bit 5930 /// value shifted left by either 0 or 12. If it is possible to do so, return 5931 /// the immediate and shift value. If not, return None. 5932 /// 5933 /// Used by selectArithImmed and selectNegArithImmed. 5934 InstructionSelector::ComplexRendererFns 5935 AArch64InstructionSelector::select12BitValueWithLeftShift( 5936 uint64_t Immed) const { 5937 unsigned ShiftAmt; 5938 if (Immed >> 12 == 0) { 5939 ShiftAmt = 0; 5940 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 5941 ShiftAmt = 12; 5942 Immed = Immed >> 12; 5943 } else 5944 return None; 5945 5946 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 5947 return {{ 5948 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 5949 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 5950 }}; 5951 } 5952 5953 /// SelectArithImmed - Select an immediate value that can be represented as 5954 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 5955 /// Val set to the 12-bit value and Shift set to the shifter operand. 5956 InstructionSelector::ComplexRendererFns 5957 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 5958 // This function is called from the addsub_shifted_imm ComplexPattern, 5959 // which lists [imm] as the list of opcode it's interested in, however 5960 // we still need to check whether the operand is actually an immediate 5961 // here because the ComplexPattern opcode list is only used in 5962 // root-level opcode matching. 5963 auto MaybeImmed = getImmedFromMO(Root); 5964 if (MaybeImmed == None) 5965 return None; 5966 return select12BitValueWithLeftShift(*MaybeImmed); 5967 } 5968 5969 /// SelectNegArithImmed - As above, but negates the value before trying to 5970 /// select it. 
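/// For example (a sketch): an add of the immediate -16 can instead be selected
/// as a subtract of 16, because 16 fits the 12-bit (optionally LSL #12)
/// immediate form while -16 does not.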
5971 InstructionSelector::ComplexRendererFns 5972 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 5973 // We need a register here, because we need to know if we have a 64 or 32 5974 // bit immediate. 5975 if (!Root.isReg()) 5976 return None; 5977 auto MaybeImmed = getImmedFromMO(Root); 5978 if (MaybeImmed == None) 5979 return None; 5980 uint64_t Immed = *MaybeImmed; 5981 5982 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 5983 // have the opposite effect on the C flag, so this pattern mustn't match under 5984 // those circumstances. 5985 if (Immed == 0) 5986 return None; 5987 5988 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 5989 // the root. 5990 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 5991 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 5992 Immed = ~((uint32_t)Immed) + 1; 5993 else 5994 Immed = ~Immed + 1ULL; 5995 5996 if (Immed & 0xFFFFFFFFFF000000ULL) 5997 return None; 5998 5999 Immed &= 0xFFFFFFULL; 6000 return select12BitValueWithLeftShift(Immed); 6001 } 6002 6003 /// Return true if it is worth folding MI into an extended register. That is, 6004 /// if it's safe to pull it into the addressing mode of a load or store as a 6005 /// shift. 6006 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 6007 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 6008 // Always fold if there is one use, or if we're optimizing for size. 6009 Register DefReg = MI.getOperand(0).getReg(); 6010 if (MRI.hasOneNonDBGUse(DefReg) || 6011 MI.getParent()->getParent()->getFunction().hasOptSize()) 6012 return true; 6013 6014 // It's better to avoid folding and recomputing shifts when we don't have a 6015 // fastpath. 6016 if (!STI.hasLSLFast()) 6017 return false; 6018 6019 // We have a fastpath, so folding a shift in and potentially computing it 6020 // many times may be beneficial. Check if this is only used in memory ops. 6021 // If it is, then we should fold. 6022 return all_of(MRI.use_nodbg_instructions(DefReg), 6023 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 6024 } 6025 6026 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 6027 switch (Type) { 6028 case AArch64_AM::SXTB: 6029 case AArch64_AM::SXTH: 6030 case AArch64_AM::SXTW: 6031 return true; 6032 default: 6033 return false; 6034 } 6035 } 6036 6037 InstructionSelector::ComplexRendererFns 6038 AArch64InstructionSelector::selectExtendedSHL( 6039 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 6040 unsigned SizeInBytes, bool WantsExt) const { 6041 assert(Base.isReg() && "Expected base to be a register operand"); 6042 assert(Offset.isReg() && "Expected offset to be a register operand"); 6043 6044 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6045 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 6046 6047 unsigned OffsetOpc = OffsetInst->getOpcode(); 6048 bool LookedThroughZExt = false; 6049 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 6050 // Try to look through a ZEXT. 6051 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 6052 return None; 6053 6054 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 6055 OffsetOpc = OffsetInst->getOpcode(); 6056 LookedThroughZExt = true; 6057 6058 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 6059 return None; 6060 } 6061 // Make sure that the memory op is a valid size. 
6062 int64_t LegalShiftVal = Log2_32(SizeInBytes); 6063 if (LegalShiftVal == 0) 6064 return None; 6065 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 6066 return None; 6067 6068 // Now, try to find the specific G_CONSTANT. Start by assuming that the 6069 // register we will offset is the LHS, and the register containing the 6070 // constant is the RHS. 6071 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 6072 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 6073 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6074 if (!ValAndVReg) { 6075 // We didn't get a constant on the RHS. If the opcode is a shift, then 6076 // we're done. 6077 if (OffsetOpc == TargetOpcode::G_SHL) 6078 return None; 6079 6080 // If we have a G_MUL, we can use either register. Try looking at the RHS. 6081 std::swap(OffsetReg, ConstantReg); 6082 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6083 if (!ValAndVReg) 6084 return None; 6085 } 6086 6087 // The value must fit into 3 bits, and must be positive. Make sure that is 6088 // true. 6089 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 6090 6091 // Since we're going to pull this into a shift, the constant value must be 6092 // a power of 2. If we got a multiply, then we need to check this. 6093 if (OffsetOpc == TargetOpcode::G_MUL) { 6094 if (!isPowerOf2_32(ImmVal)) 6095 return None; 6096 6097 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 6098 ImmVal = Log2_32(ImmVal); 6099 } 6100 6101 if ((ImmVal & 0x7) != ImmVal) 6102 return None; 6103 6104 // We are only allowed to shift by LegalShiftVal. This shift value is built 6105 // into the instruction, so we can't just use whatever we want. 6106 if (ImmVal != LegalShiftVal) 6107 return None; 6108 6109 unsigned SignExtend = 0; 6110 if (WantsExt) { 6111 // Check if the offset is defined by an extend, unless we looked through a 6112 // G_ZEXT earlier. 6113 if (!LookedThroughZExt) { 6114 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 6115 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 6116 if (Ext == AArch64_AM::InvalidShiftExtend) 6117 return None; 6118 6119 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 6120 // We only support SXTW for signed extension here. 6121 if (SignExtend && Ext != AArch64_AM::SXTW) 6122 return None; 6123 OffsetReg = ExtInst->getOperand(1).getReg(); 6124 } 6125 6126 // Need a 32-bit wide register here. 6127 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 6128 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 6129 } 6130 6131 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 6132 // offset. Signify that we are shifting by setting the shift flag to 1. 6133 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 6134 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 6135 [=](MachineInstrBuilder &MIB) { 6136 // Need to add both immediates here to make sure that they are both 6137 // added to the instruction. 6138 MIB.addImm(SignExtend); 6139 MIB.addImm(1); 6140 }}}; 6141 } 6142 6143 /// This is used for computing addresses like this: 6144 /// 6145 /// ldr x1, [x2, x3, lsl #3] 6146 /// 6147 /// Where x2 is the base register, and x3 is an offset register. The shift-left 6148 /// is a constant value specific to this load instruction. That is, we'll never 6149 /// see anything other than a 3 here (which corresponds to the size of the 6150 /// element being loaded.) 
6151 InstructionSelector::ComplexRendererFns
6152 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6153 MachineOperand &Root, unsigned SizeInBytes) const {
6154 if (!Root.isReg())
6155 return None;
6156 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6157
6158 // We want to find something like this:
6159 //
6160 // val = G_CONSTANT LegalShiftVal
6161 // shift = G_SHL off_reg val
6162 // ptr = G_PTR_ADD base_reg shift
6163 // x = G_LOAD ptr
6164 //
6165 // And fold it into this addressing mode:
6166 //
6167 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
6168
6169 // Check if we can find the G_PTR_ADD.
6170 MachineInstr *PtrAdd =
6171 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6172 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6173 return None;
6174
6175 // Now, try to match an opcode which will match our specific offset.
6176 // We want a G_SHL or a G_MUL.
6177 MachineInstr *OffsetInst =
6178 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
6179 return selectExtendedSHL(Root, PtrAdd->getOperand(1),
6180 OffsetInst->getOperand(0), SizeInBytes,
6181 /*WantsExt=*/false);
6182 }
6183
6184 /// This is used for computing addresses like this:
6185 ///
6186 /// ldr x1, [x2, x3]
6187 ///
6188 /// Where x2 is the base register, and x3 is an offset register.
6189 ///
6190 /// When it is possible (or profitable) to fold a G_PTR_ADD into the address calculation,
6191 /// this will do so. Otherwise, it will return None.
6192 InstructionSelector::ComplexRendererFns
6193 AArch64InstructionSelector::selectAddrModeRegisterOffset(
6194 MachineOperand &Root) const {
6195 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6196
6197 // We need a GEP.
6198 MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
6199 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
6200 return None;
6201
6202 // If this is used more than once, let's not bother folding.
6203 // TODO: Check if they are memory ops. If they are, then we can still fold
6204 // without having to recompute anything.
6205 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
6206 return None;
6207
6208 // Base is the GEP's LHS, offset is its RHS.
6209 return {{[=](MachineInstrBuilder &MIB) {
6210 MIB.addUse(Gep->getOperand(1).getReg());
6211 },
6212 [=](MachineInstrBuilder &MIB) {
6213 MIB.addUse(Gep->getOperand(2).getReg());
6214 },
6215 [=](MachineInstrBuilder &MIB) {
6216 // Need to add both immediates here to make sure that they are both
6217 // added to the instruction.
6218 MIB.addImm(0);
6219 MIB.addImm(0);
6220 }}};
6221 }
6222
6223 /// This is intended to be equivalent to selectAddrModeXRO in
6224 /// AArch64ISelDAGToDAG. It's used for selecting X register offset loads.
6225 InstructionSelector::ComplexRendererFns
6226 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
6227 unsigned SizeInBytes) const {
6228 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6229 if (!Root.isReg())
6230 return None;
6231 MachineInstr *PtrAdd =
6232 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6233 if (!PtrAdd)
6234 return None;
6235
6236 // Check for an immediate which cannot be encoded in the [base + imm]
6237 // addressing mode, and can't be encoded in an add/sub.
If this happens, we'll 6238 // end up with code like: 6239 // 6240 // mov x0, wide 6241 // add x1 base, x0 6242 // ldr x2, [x1, x0] 6243 // 6244 // In this situation, we can use the [base, xreg] addressing mode to save an 6245 // add/sub: 6246 // 6247 // mov x0, wide 6248 // ldr x2, [base, x0] 6249 auto ValAndVReg = 6250 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); 6251 if (ValAndVReg) { 6252 unsigned Scale = Log2_32(SizeInBytes); 6253 int64_t ImmOff = ValAndVReg->Value.getSExtValue(); 6254 6255 // Skip immediates that can be selected in the load/store addresing 6256 // mode. 6257 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && 6258 ImmOff < (0x1000 << Scale)) 6259 return None; 6260 6261 // Helper lambda to decide whether or not it is preferable to emit an add. 6262 auto isPreferredADD = [](int64_t ImmOff) { 6263 // Constants in [0x0, 0xfff] can be encoded in an add. 6264 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) 6265 return true; 6266 6267 // Can it be encoded in an add lsl #12? 6268 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) 6269 return false; 6270 6271 // It can be encoded in an add lsl #12, but we may not want to. If it is 6272 // possible to select this as a single movz, then prefer that. A single 6273 // movz is faster than an add with a shift. 6274 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && 6275 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; 6276 }; 6277 6278 // If the immediate can be encoded in a single add/sub, then bail out. 6279 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) 6280 return None; 6281 } 6282 6283 // Try to fold shifts into the addressing mode. 6284 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); 6285 if (AddrModeFns) 6286 return AddrModeFns; 6287 6288 // If that doesn't work, see if it's possible to fold in registers from 6289 // a GEP. 6290 return selectAddrModeRegisterOffset(Root); 6291 } 6292 6293 /// This is used for computing addresses like this: 6294 /// 6295 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] 6296 /// 6297 /// Where we have a 64-bit base register, a 32-bit offset register, and an 6298 /// extend (which may or may not be signed). 6299 InstructionSelector::ComplexRendererFns 6300 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, 6301 unsigned SizeInBytes) const { 6302 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6303 6304 MachineInstr *PtrAdd = 6305 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 6306 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 6307 return None; 6308 6309 MachineOperand &LHS = PtrAdd->getOperand(1); 6310 MachineOperand &RHS = PtrAdd->getOperand(2); 6311 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); 6312 6313 // The first case is the same as selectAddrModeXRO, except we need an extend. 6314 // In this case, we try to find a shift and extend, and fold them into the 6315 // addressing mode. 6316 // 6317 // E.g. 6318 // 6319 // off_reg = G_Z/S/ANYEXT ext_reg 6320 // val = G_CONSTANT LegalShiftVal 6321 // shift = G_SHL off_reg val 6322 // ptr = G_PTR_ADD base_reg shift 6323 // x = G_LOAD ptr 6324 // 6325 // In this case we can get a load like this: 6326 // 6327 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] 6328 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), 6329 SizeInBytes, /*WantsExt=*/true); 6330 if (ExtendedShl) 6331 return ExtendedShl; 6332 6333 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. 
/// This is used for computing addresses like this:
///
/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
///
/// Where we have a 64-bit base register, a 32-bit offset register, and an
/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
    return None;

  MachineOperand &LHS = PtrAdd->getOperand(1);
  MachineOperand &RHS = PtrAdd->getOperand(2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
  //
  // e.g.
  // ldr something, [base_reg, ext_reg, sxtw]
  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
    return None;

  // Check if this is an extend. We'll get an extend type if it is.
  AArch64_AM::ShiftExtendType Ext =
      getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
  if (Ext == AArch64_AM::InvalidShiftExtend)
    return None;

  // Need a 32-bit wide register.
  MachineIRBuilder MIB(*PtrAdd);
  Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
                                       AArch64::GPR32RegClass, MIB);
  unsigned SignExtend = Ext == AArch64_AM::SXTW;

  // Base is LHS, offset is ExtReg.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(SignExtend);
             MIB.addImm(0);
           }}};
}

/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
/// memory reference, which is needed here to know what is valid for a scaled
/// immediate.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
                                                   unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  if (!Root.isReg())
    return None;

  if (!isBaseWithConstantOffset(Root, MRI))
    return None;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());

  MachineOperand &OffImm = RootDef->getOperand(2);
  if (!OffImm.isReg())
    return None;
  MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
  if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
    return None;
  int64_t RHSC;
  MachineOperand &RHSOp1 = RHS->getOperand(1);
  if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
    return None;
  RHSC = RHSOp1.getCImm()->getSExtValue();

  // If the offset is valid as a scaled immediate, don't match here.
  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
      RHSC < (0x1000 << Log2_32(Size)))
    return None;
  if (RHSC >= -256 && RHSC < 256) {
    MachineOperand &Base = RootDef->getOperand(1);
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
    }};
  }
  return None;
}
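
// Illustrative example for the scaled/unscaled split above: for an 8-byte
// access, an offset of 12 is not a multiple of the access size, so the
// scaled form "ldr x0, [x1, #12]" is not encodable and the unscaled form
// "ldur x0, [x1, #12]" (signed 9-bit immediate) matches here instead. An
// offset of 16 is rejected here and is handled by the scaled form in
// selectAddrModeIndexed below.
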
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
                                                 unsigned Size,
                                                 MachineRegisterInfo &MRI) const {
  if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
    return None;
  MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
  if (Adrp.getOpcode() != AArch64::ADRP)
    return None;

  // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
  auto Offset = Adrp.getOperand(1).getOffset();
  if (Offset % Size != 0)
    return None;

  auto GV = Adrp.getOperand(1).getGlobal();
  if (GV->isThreadLocal())
    return None;

  auto &MF = *RootDef.getParent()->getParent();
  if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
    return None;

  unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
  MachineIRBuilder MIRBuilder(RootDef);
  Register AdrpReg = Adrp.getOperand(0).getReg();
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addGlobalAddress(GV, Offset,
                                  OpFlags | AArch64II::MO_PAGEOFF |
                                      AArch64II::MO_NC);
           }}};
}

/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineFunction &MF = *Root.getParent()->getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!Root.isReg())
    return None;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
  if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
    }};
  }

  CodeModel::Model CM = MF.getTarget().getCodeModel();
  // Check if we can fold in the ADD of a small code model ADRP + ADD address.
  if (CM == CodeModel::Small) {
    auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
    if (OpFns)
      return OpFns;
  }

  if (isBaseWithConstantOffset(Root, MRI)) {
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineOperand &RHS = RootDef->getOperand(2);
    MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
    MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());

    int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
    unsigned Scale = Log2_32(Size);
    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
      if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
        }};

      return {{
          [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
      }};
    }
  }

  // Before falling back to our general case, check if the unscaled
  // instructions can handle this. If so, that's preferable.
  if (selectAddrModeUnscaled(Root, Size))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
  }};
}
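
// Illustrative result of the small-code-model fold above (someGlobal is a
// placeholder symbol): instead of materializing the full address with
// ADRP + ADD and then loading, the page offset is folded into the load's
// immediate operand:
//
//   adrp x8, someGlobal
//   ldr  x0, [x8, :lo12:someGlobal]
//
// The alignment and offset checks in tryFoldAddLowIntoImm are what keep the
// :lo12: value a legal scaled immediate for the access size.
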
/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case TargetOpcode::G_SHL:
    return AArch64_AM::LSL;
  case TargetOpcode::G_LSHR:
    return AArch64_AM::LSR;
  case TargetOpcode::G_ASHR:
    return AArch64_AM::ASR;
  case TargetOpcode::G_ROTR:
    return AArch64_AM::ROR;
  }
}

/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
                                                  bool AllowROR) const {
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // Check if the operand is defined by an instruction which corresponds to
  // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
  MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
  AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
  if (ShType == AArch64_AM::InvalidShiftExtend)
    return None;
  if (ShType == AArch64_AM::ROR && !AllowROR)
    return None;
  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
    return None;

  // Need an immediate on the RHS.
  MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
  auto Immed = getImmedFromMO(ShiftRHS);
  if (!Immed)
    return None;

  // We have something that we can fold. Fold the shift's LHS and RHS into
  // the instruction.
  MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
  Register ShiftReg = ShiftLHS.getReg();

  unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
  unsigned Val = *Immed & (NumBits - 1);
  unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}
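
// Illustrative fold for the shifted-register operand above (assuming the
// shift has a constant amount and is worth folding):
//
//   %three:gpr(s64) = G_CONSTANT i64 3
//   %sh:gpr(s64) = G_SHL %x, %three
//   %r:gpr(s64) = G_ADD %y, %sh
//     -->
//   add x0, xY, xX, lsl #3
//
// getShifterImm() packs the shift kind and amount into the single immediate
// rendered second.
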
AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    else
      Size = MI.getOperand(2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}
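
// Illustrative fold for the arithmetic extended-register operand selected
// below (assuming the extend is worth folding):
//
//   %ext:gpr(s64) = G_ZEXT %b:gpr(s8)
//   %r:gpr(s64) = G_ADD %a, %ext
//     -->
//   add x0, xA, wB, uxtb
//
// A G_AND with mask 0xFF/0xFFFF/0xFFFFFFFF is treated the same way as an
// explicit zero-extend by getExtendTypeForInst() above.
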
/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return None;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
    return None;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return None;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return None;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return None;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return None;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return None;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32-bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (isDef32(*ExtInst))
        return None;
    }
  }

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  Optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(*CstVal);
}

void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}
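
// Background for isDef32() below: on AArch64, a write to a W register
// implicitly zeroes bits [63:32] of the corresponding X register, e.g.
//
//   add w0, w1, w2   ; the top 32 bits of x0 are now zero
//
// so most 32-bit defs give us a zero-extend for free. COPY, G_TRUNC, and the
// other listed opcodes are excluded because they may be lowered to
// subregister copies, which leave the high bits untouched.
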
bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be on the same regbank as the destination.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}
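
// Placement sketch for the cross-bank copies built above: if the incoming
// value is itself defined by a PHI, the copy cannot legally go right after
// that def (PHIs must stay grouped at the top of the block), so it is sunk
// to the first non-PHI position instead, e.g.
//
//   %bb2:
//     %in2:fpr(s16) = G_PHI ...
//     ;; more PHIs ...
//     %in2_copy:gpr(s16) = COPY %in2    <- first non-PHI insertion point
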
void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs; build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    //  =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix this up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
}