//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred, if given, is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
  /// described below, in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for an FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
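  ///
  /// Illustrative example (not taken from a real lowering): a condition like
  /// (x == 0) && (y < z) can typically be emitted as a flag-setting compare
  /// for the first test followed by a CCMP that only evaluates the second
  /// comparison when the first one passed, so \p OutCC ends up describing the
  /// whole conjunction.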
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// std::nullopt when this is not profitable or possible.
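  ///
  /// For example (illustrative): a G_SHL of the index by 3 feeding the address
  /// of an 8-byte access can usually be folded into a [base, offset, lsl #3]
  /// style operand instead of being materialized as a separate shift.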
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
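  ///
  /// (For reference: on AArch64, writing a W register implicitly zeroes bits
  /// [63:32] of the corresponding X register, which is what makes this query
  /// useful when widening 32-bit values.)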
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

  // We declare the temporaries used by selectImpl() in the class to minimize
  // the cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
//
/// Given a register bank, and a type, return the smallest register class that
/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    switch (Ty.getSizeInBits()) {
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
                      bool GetAllRegSet = false) {
  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of the REG_SEQUENCE instruction
/// that was created, or the 0th element of \p Regs if \p Regs contains a
/// single element.
static Register createTuple(ArrayRef<Register> Regs,
                            const unsigned RegClassIDs[],
                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
  unsigned NumRegs = Regs.size();
  if (NumRegs == 1)
    return Regs[0];
  assert(NumRegs >= 2 && NumRegs <= 4 &&
         "Only support between 2 and 4 registers in a tuple!");
  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
  auto RegSequence =
      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
    RegSequence.addUse(Regs[I]);
    RegSequence.addImm(SubRegs[I]);
  }
  return RegSequence.getReg(0);
}

/// Create a tuple of D-registers using the registers in \p Regs.
static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
                                     AArch64::dsub2, AArch64::dsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

/// Create a tuple of Q-registers using the registers in \p Regs.
static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
                                     AArch64::qsub2, AArch64::qsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MI = *Root.getParent();
  auto &MBB = *MI.getParent();
  auto &MF = *MBB.getParent();
  auto &MRI = MF.getRegInfo();
  uint64_t Immed;
  if (Root.isImm())
    Immed = Root.getImm();
  else if (Root.isCImm())
    Immed = Root.getCImm()->getZExtValue();
  else if (Root.isReg()) {
    auto ValAndVReg =
        getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
    if (!ValAndVReg)
      return std::nullopt;
    Immed = ValAndVReg->Value.getSExtValue();
  } else
    return std::nullopt;
  return Immed;
}

/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - its operands are not all in the same bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the AArch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
                             const AArch64RegisterBankInfo &RBI,
                             const MachineRegisterInfo &MRI,
                             const AArch64RegisterInfo &TRI) {
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  if (!Ty.isValid()) {
    LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
    return true;
  }

  const RegisterBank *PrevOpBank = nullptr;
  for (auto &MO : I.operands()) {
    // FIXME: Support non-register operands.
    if (!MO.isReg()) {
      LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
      return true;
    }

    // FIXME: Can generic operations have physical register operands? If
    // so, this will need to be taught about that, and we'll need to get the
    // bank out of the minimal class for the register.
    // Either way, this needs to be documented (and possibly verified).
    if (!MO.getReg().isVirtual()) {
      LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
      return true;
    }

    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
    if (!OpBank) {
      LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
      return true;
    }

    if (PrevOpBank && OpBank != PrevOpBank) {
      LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
      return true;
    }
    PrevOpBank = OpBank;
  }
  return false;
}

/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
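///
/// For example (illustrative): a 32-bit G_SHL on the GPR bank maps to
/// AArch64::LSLVWr, while any combination not listed below simply returns
/// \p GenericOpc unchanged.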
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize. This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
    case 16:
      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
    case 32:
      return isStore ? AArch64::STRWui : AArch64::LDRWui;
    case 64:
      return isStore ? AArch64::STRXui : AArch64::LDRXui;
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBui : AArch64::LDRBui;
    case 16:
      return isStore ? AArch64::STRHui : AArch64::LDRHui;
    case 32:
      return isStore ? AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    case 128:
      return isStore ? AArch64::STRQui : AArch64::LDRQui;
    }
    break;
  }
  return GenericOpc;
}

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g., "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!I.getOperand(0).getReg().isPhysical())
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source and destination register
/// classes for the copy. If a register class cannot be determined, then it
/// will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
    SrcSize = DstSize = 32;

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
    if (!RC) {
      const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, *RC, MRI);
  }

  return true;
}

static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination
  // registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
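      //
      // Roughly, the emitted MIR looks like (illustrative):
      //   %promoted:gpr64 = SUBREG_TO_REG 0, %src:gpr32, %subreg.sub_32
      // with the original copy then reading %promoted instead of %src.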
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return true;
}

static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  default:
    return GenericOpc;
  }
  return GenericOpc;
}

MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
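  // Summary of the strategy used below: vector selects are rejected, FPR
  // operands become FCSEL, and GPR operands default to CSEL, with the lambdas
  // below first trying to fold the true/false values into CSINC/CSINV/CSNEG
  // forms.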
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
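/// When a single AArch64 condition is not enough (e.g. FCMP_ONE), a second
/// condition is returned in CondCode2, and the two are intended to be OR'ed.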
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToORAArch64CC in that it returns cond codes
/// that should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case CmpInst::FCMP_ONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case CmpInst::FCMP_UEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    std::optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(1).getReg();
      Register ConstantReg = MI->getOperand(2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND commutes, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(ConstantReg, TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      }
      if (VRegAndVal) {
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      TestReg = MI->getOperand(1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
      // the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}

MachineInstr *AArch64InstructionSelector::emitTestBit(
    Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
    MachineIRBuilder &MIB) const {
  assert(TestReg.isValid());
  assert(ProduceNonFlagSettingCondBr &&
         "Cannot emit TB(N)Z with speculation tracking!");
  MachineRegisterInfo &MRI = *MIB.getMRI();

  // Attempt to optimize the test bit by walking over instructions.
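  // (For example, testing bit 5 of (G_SHL %x, 3) is equivalent to testing
  // bit 2 of %x, so getTestBitReg may hand back %x with an adjusted Bit.)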
  TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
  LLT Ty = MRI.getType(TestReg);
  unsigned Size = Ty.getSizeInBits();
  assert(!Ty.isVector() && "Expected a scalar!");
  assert(Bit < 64 && "Bit is too large!");

  // When the test register is a 64-bit register, we have to narrow to make
  // TBNZW work.
  bool UseWReg = Bit < 32;
  unsigned NecessarySize = UseWReg ? 32 : 64;
  if (Size != NecessarySize)
    TestReg = moveScalarRegClass(
        TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
        MIB);

  static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
                                          {AArch64::TBZW, AArch64::TBNZW}};
  unsigned Opc = OpcTable[UseWReg][IsNegative];
  auto TestBitMI =
      MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
  constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
  return &*TestBitMI;
}

bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
    MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
    MachineIRBuilder &MIB) const {
  assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
  // Given something like this:
  //
  // %x = ...Something...
  // %one = G_CONSTANT i64 1
  // %zero = G_CONSTANT i64 0
  // %and = G_AND %x, %one
  // %cmp = G_ICMP intpred(ne), %and, %zero
  // %cmp_trunc = G_TRUNC %cmp
  // G_BRCOND %cmp_trunc, %bb.3
  //
  // We want to try and fold the AND into the G_BRCOND and produce either a
  // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
  //
  // In this case, we'd get
  //
  // TBNZ %x %bb.3
  //

  // Check if the AND has a constant on its RHS which we can use as a mask.
  // If it's a power of 2, then it's the same as checking a specific bit.
  // (e.g., ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
  auto MaybeBit = getIConstantVRegValWithLookThrough(
      AndInst.getOperand(2).getReg(), *MIB.getMRI());
  if (!MaybeBit)
    return false;

  int32_t Bit = MaybeBit->Value.exactLogBase2();
  if (Bit < 0)
    return false;

  Register TestReg = AndInst.getOperand(1).getReg();

  // Emit a TB(N)Z.
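  // e.g. for %and = G_AND %x, 8 compared (ne) against 0, this emits
  // TBNZ %x, 3, since 8 has only bit 3 set.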
1593 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1594 return true; 1595 } 1596 1597 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1598 bool IsNegative, 1599 MachineBasicBlock *DestMBB, 1600 MachineIRBuilder &MIB) const { 1601 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1602 MachineRegisterInfo &MRI = *MIB.getMRI(); 1603 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1604 AArch64::GPRRegBankID && 1605 "Expected GPRs only?"); 1606 auto Ty = MRI.getType(CompareReg); 1607 unsigned Width = Ty.getSizeInBits(); 1608 assert(!Ty.isVector() && "Expected scalar only?"); 1609 assert(Width <= 64 && "Expected width to be at most 64?"); 1610 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1611 {AArch64::CBNZW, AArch64::CBNZX}}; 1612 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1613 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1614 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1615 return &*BranchMI; 1616 } 1617 1618 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1619 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1620 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1621 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1622 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1623 // totally clean. Some of them require two branches to implement. 1624 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1625 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1626 Pred); 1627 AArch64CC::CondCode CC1, CC2; 1628 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1629 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1630 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1631 if (CC2 != AArch64CC::AL) 1632 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1633 I.eraseFromParent(); 1634 return true; 1635 } 1636 1637 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1638 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1639 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1640 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1641 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1642 // 1643 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1644 // instructions will not be produced, as they are conditional branch 1645 // instructions that do not set flags. 1646 if (!ProduceNonFlagSettingCondBr) 1647 return false; 1648 1649 MachineRegisterInfo &MRI = *MIB.getMRI(); 1650 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1651 auto Pred = 1652 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1653 Register LHS = ICmp.getOperand(2).getReg(); 1654 Register RHS = ICmp.getOperand(3).getReg(); 1655 1656 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1657 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1658 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1659 1660 // When we can emit a TB(N)Z, prefer that. 1661 // 1662 // Handle non-commutative condition codes first. 1663 // Note that we don't want to do this when we have a G_AND because it can 1664 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1665 if (VRegAndVal && !AndInst) { 1666 int64_t C = VRegAndVal->Value.getSExtValue(); 1667 1668 // When we have a greater-than comparison, we can just test if the msb is 1669 // zero. 
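    // That is, for signed x the comparison "x > -1" holds exactly when the
    // sign bit of x is clear, so it can be selected as a TBZ on bit
    // (size - 1); the symmetric cases below use TBNZ/TBZ on the same bit.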
1670 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1671 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1672 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1673 I.eraseFromParent(); 1674 return true; 1675 } 1676 1677 // When we have a less than comparison, we can just test if the msb is not 1678 // zero. 1679 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1680 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1681 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1682 I.eraseFromParent(); 1683 return true; 1684 } 1685 1686 // Inversely, if we have a signed greater-than-or-equal comparison to zero, 1687 // we can test if the msb is zero. 1688 if (C == 0 && Pred == CmpInst::ICMP_SGE) { 1689 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1690 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1691 I.eraseFromParent(); 1692 return true; 1693 } 1694 } 1695 1696 // Attempt to handle commutative condition codes. Right now, that's only 1697 // eq/ne. 1698 if (ICmpInst::isEquality(Pred)) { 1699 if (!VRegAndVal) { 1700 std::swap(RHS, LHS); 1701 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1702 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1703 } 1704 1705 if (VRegAndVal && VRegAndVal->Value == 0) { 1706 // If there's a G_AND feeding into this branch, try to fold it away by 1707 // emitting a TB(N)Z instead. 1708 // 1709 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1710 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1711 // would be redundant. 1712 if (AndInst && 1713 tryOptAndIntoCompareBranch( 1714 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1715 I.eraseFromParent(); 1716 return true; 1717 } 1718 1719 // Otherwise, try to emit a CB(N)Z instead. 1720 auto LHSTy = MRI.getType(LHS); 1721 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1722 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1723 I.eraseFromParent(); 1724 return true; 1725 } 1726 } 1727 } 1728 1729 return false; 1730 } 1731 1732 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1733 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1734 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1735 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1736 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1737 return true; 1738 1739 // Couldn't optimize. Emit a compare + a Bcc. 1740 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1741 auto PredOp = ICmp.getOperand(1); 1742 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1743 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1744 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1745 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1746 I.eraseFromParent(); 1747 return true; 1748 } 1749 1750 bool AArch64InstructionSelector::selectCompareBranch( 1751 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1752 Register CondReg = I.getOperand(0).getReg(); 1753 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1754 // Try to select the G_BRCOND using whatever is feeding the condition if 1755 // possible. 
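  // For example (illustrative): a G_BRCOND fed by a G_ICMP may fold into a
  // TB(N)Z or CB(N)Z via the ICmp path, while a G_FCMP feeder always becomes
  // an FP compare plus one or two Bcc instructions.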
1756 unsigned CCMIOpc = CCMI->getOpcode(); 1757 if (CCMIOpc == TargetOpcode::G_FCMP) 1758 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1759 if (CCMIOpc == TargetOpcode::G_ICMP) 1760 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1761 1762 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1763 // instructions will not be produced, as they are conditional branch 1764 // instructions that do not set flags. 1765 if (ProduceNonFlagSettingCondBr) { 1766 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1767 I.getOperand(1).getMBB(), MIB); 1768 I.eraseFromParent(); 1769 return true; 1770 } 1771 1772 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1773 auto TstMI = 1774 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1775 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1776 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1777 .addImm(AArch64CC::EQ) 1778 .addMBB(I.getOperand(1).getMBB()); 1779 I.eraseFromParent(); 1780 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1781 } 1782 1783 /// Returns the element immediate value of a vector shift operand if found. 1784 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1785 static std::optional<int64_t> getVectorShiftImm(Register Reg, 1786 MachineRegisterInfo &MRI) { 1787 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1788 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1789 return getAArch64VectorSplatScalar(*OpMI, MRI); 1790 } 1791 1792 /// Matches and returns the shift immediate value for a SHL instruction given 1793 /// a shift operand. 1794 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, 1795 MachineRegisterInfo &MRI) { 1796 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1797 if (!ShiftImm) 1798 return std::nullopt; 1799 // Check the immediate is in range for a SHL. 1800 int64_t Imm = *ShiftImm; 1801 if (Imm < 0) 1802 return std::nullopt; 1803 switch (SrcTy.getElementType().getSizeInBits()) { 1804 default: 1805 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1806 return std::nullopt; 1807 case 8: 1808 if (Imm > 7) 1809 return std::nullopt; 1810 break; 1811 case 16: 1812 if (Imm > 15) 1813 return std::nullopt; 1814 break; 1815 case 32: 1816 if (Imm > 31) 1817 return std::nullopt; 1818 break; 1819 case 64: 1820 if (Imm > 63) 1821 return std::nullopt; 1822 break; 1823 } 1824 return Imm; 1825 } 1826 1827 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1828 MachineRegisterInfo &MRI) { 1829 assert(I.getOpcode() == TargetOpcode::G_SHL); 1830 Register DstReg = I.getOperand(0).getReg(); 1831 const LLT Ty = MRI.getType(DstReg); 1832 Register Src1Reg = I.getOperand(1).getReg(); 1833 Register Src2Reg = I.getOperand(2).getReg(); 1834 1835 if (!Ty.isVector()) 1836 return false; 1837 1838 // Check if we have a vector of constants on RHS that we can select as the 1839 // immediate form. 1840 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1841 1842 unsigned Opc = 0; 1843 if (Ty == LLT::fixed_vector(2, 64)) { 1844 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1845 } else if (Ty == LLT::fixed_vector(4, 32)) { 1846 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1847 } else if (Ty == LLT::fixed_vector(2, 32)) { 1848 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1849 } else if (Ty == LLT::fixed_vector(4, 16)) { 1850 Opc = ImmVal ? 
AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1851 } else if (Ty == LLT::fixed_vector(8, 16)) { 1852 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1853 } else if (Ty == LLT::fixed_vector(16, 8)) { 1854 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1855 } else if (Ty == LLT::fixed_vector(8, 8)) { 1856 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1857 } else { 1858 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1859 return false; 1860 } 1861 1862 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1863 if (ImmVal) 1864 Shl.addImm(*ImmVal); 1865 else 1866 Shl.addUse(Src2Reg); 1867 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1868 I.eraseFromParent(); 1869 return true; 1870 } 1871 1872 bool AArch64InstructionSelector::selectVectorAshrLshr( 1873 MachineInstr &I, MachineRegisterInfo &MRI) { 1874 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1875 I.getOpcode() == TargetOpcode::G_LSHR); 1876 Register DstReg = I.getOperand(0).getReg(); 1877 const LLT Ty = MRI.getType(DstReg); 1878 Register Src1Reg = I.getOperand(1).getReg(); 1879 Register Src2Reg = I.getOperand(2).getReg(); 1880 1881 if (!Ty.isVector()) 1882 return false; 1883 1884 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1885 1886 // We expect the immediate case to be lowered in the PostLegalCombiner to 1887 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1888 1889 // There is not a shift right register instruction, but the shift left 1890 // register instruction takes a signed value, where negative numbers specify a 1891 // right shift. 1892 1893 unsigned Opc = 0; 1894 unsigned NegOpc = 0; 1895 const TargetRegisterClass *RC = 1896 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); 1897 if (Ty == LLT::fixed_vector(2, 64)) { 1898 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1899 NegOpc = AArch64::NEGv2i64; 1900 } else if (Ty == LLT::fixed_vector(4, 32)) { 1901 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1902 NegOpc = AArch64::NEGv4i32; 1903 } else if (Ty == LLT::fixed_vector(2, 32)) { 1904 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1905 NegOpc = AArch64::NEGv2i32; 1906 } else if (Ty == LLT::fixed_vector(4, 16)) { 1907 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1908 NegOpc = AArch64::NEGv4i16; 1909 } else if (Ty == LLT::fixed_vector(8, 16)) { 1910 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1911 NegOpc = AArch64::NEGv8i16; 1912 } else if (Ty == LLT::fixed_vector(16, 8)) { 1913 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1914 NegOpc = AArch64::NEGv16i8; 1915 } else if (Ty == LLT::fixed_vector(8, 8)) { 1916 Opc = IsASHR ? 
AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1917 NegOpc = AArch64::NEGv8i8; 1918 } else { 1919 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1920 return false; 1921 } 1922 1923 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1924 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1925 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1926 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1927 I.eraseFromParent(); 1928 return true; 1929 } 1930 1931 bool AArch64InstructionSelector::selectVaStartAAPCS( 1932 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1933 return false; 1934 } 1935 1936 bool AArch64InstructionSelector::selectVaStartDarwin( 1937 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1938 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 1939 Register ListReg = I.getOperand(0).getReg(); 1940 1941 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1942 1943 auto MIB = 1944 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 1945 .addDef(ArgsAddrReg) 1946 .addFrameIndex(FuncInfo->getVarArgsStackIndex()) 1947 .addImm(0) 1948 .addImm(0); 1949 1950 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1951 1952 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 1953 .addUse(ArgsAddrReg) 1954 .addUse(ListReg) 1955 .addImm(0) 1956 .addMemOperand(*I.memoperands_begin()); 1957 1958 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1959 I.eraseFromParent(); 1960 return true; 1961 } 1962 1963 void AArch64InstructionSelector::materializeLargeCMVal( 1964 MachineInstr &I, const Value *V, unsigned OpFlags) { 1965 MachineBasicBlock &MBB = *I.getParent(); 1966 MachineFunction &MF = *MBB.getParent(); 1967 MachineRegisterInfo &MRI = MF.getRegInfo(); 1968 1969 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 1970 MovZ->addOperand(MF, I.getOperand(1)); 1971 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 1972 AArch64II::MO_NC); 1973 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 1974 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 1975 1976 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 1977 Register ForceDstReg) { 1978 Register DstReg = ForceDstReg 1979 ? 
ForceDstReg 1980 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1981 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 1982 if (auto *GV = dyn_cast<GlobalValue>(V)) { 1983 MovI->addOperand(MF, MachineOperand::CreateGA( 1984 GV, MovZ->getOperand(1).getOffset(), Flags)); 1985 } else { 1986 MovI->addOperand( 1987 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 1988 MovZ->getOperand(1).getOffset(), Flags)); 1989 } 1990 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 1991 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 1992 return DstReg; 1993 }; 1994 Register DstReg = BuildMovK(MovZ.getReg(0), 1995 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 1996 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 1997 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 1998 } 1999 2000 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 2001 MachineBasicBlock &MBB = *I.getParent(); 2002 MachineFunction &MF = *MBB.getParent(); 2003 MachineRegisterInfo &MRI = MF.getRegInfo(); 2004 2005 switch (I.getOpcode()) { 2006 case TargetOpcode::G_STORE: { 2007 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 2008 MachineOperand &SrcOp = I.getOperand(0); 2009 if (MRI.getType(SrcOp.getReg()).isPointer()) { 2010 // Allow matching with imported patterns for stores of pointers. Unlike 2011 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 2012 // and constrain. 2013 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 2014 Register NewSrc = Copy.getReg(0); 2015 SrcOp.setReg(NewSrc); 2016 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 2017 Changed = true; 2018 } 2019 return Changed; 2020 } 2021 case TargetOpcode::G_PTR_ADD: 2022 return convertPtrAddToAdd(I, MRI); 2023 case TargetOpcode::G_LOAD: { 2024 // For scalar loads of pointers, we try to convert the dest type from p0 2025 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD 2026 // conversion, this should be ok because all users should have been 2027 // selected already, so the type doesn't matter for them. 2028 Register DstReg = I.getOperand(0).getReg(); 2029 const LLT DstTy = MRI.getType(DstReg); 2030 if (!DstTy.isPointer()) 2031 return false; 2032 MRI.setType(DstReg, LLT::scalar(64)); 2033 return true; 2034 } 2035 case AArch64::G_DUP: { 2036 // Convert the type from p0 to s64 to help selection. 2037 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2038 if (!DstTy.getElementType().isPointer()) 2039 return false; 2040 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 2041 MRI.setType(I.getOperand(0).getReg(), 2042 DstTy.changeElementType(LLT::scalar(64))); 2043 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2044 I.getOperand(1).setReg(NewSrc.getReg(0)); 2045 return true; 2046 } 2047 case TargetOpcode::G_UITOFP: 2048 case TargetOpcode::G_SITOFP: { 2049 // If both source and destination regbanks are FPR, then convert the opcode 2050 // to G_SITOF so that the importer can select it to an fpr variant. 2051 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 2052 // copy. 
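    // For example (illustrative): %d:fpr(s32) = G_SITOFP %s:fpr(s32) is
    // rewritten below to AArch64::G_SITOF so the imported patterns can select
    // the FPR-to-FPR SCVTF form directly instead of going via a GPR copy.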
2053 Register SrcReg = I.getOperand(1).getReg(); 2054 LLT SrcTy = MRI.getType(SrcReg); 2055 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2056 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2057 return false; 2058 2059 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2060 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2061 I.setDesc(TII.get(AArch64::G_SITOF)); 2062 else 2063 I.setDesc(TII.get(AArch64::G_UITOF)); 2064 return true; 2065 } 2066 return false; 2067 } 2068 default: 2069 return false; 2070 } 2071 } 2072 2073 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2074 /// them to a standard G_ADD with a COPY on the source. 2075 /// 2076 /// The motivation behind this is to expose the add semantics to the imported 2077 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2078 /// because the selector works bottom up, uses before defs. By the time we 2079 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2080 /// fold this into addressing modes and were therefore unsuccessful. 2081 bool AArch64InstructionSelector::convertPtrAddToAdd( 2082 MachineInstr &I, MachineRegisterInfo &MRI) { 2083 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2084 Register DstReg = I.getOperand(0).getReg(); 2085 Register AddOp1Reg = I.getOperand(1).getReg(); 2086 const LLT PtrTy = MRI.getType(DstReg); 2087 if (PtrTy.getAddressSpace() != 0) 2088 return false; 2089 2090 const LLT CastPtrTy = 2091 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2092 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2093 // Set regbanks on the registers. 2094 if (PtrTy.isVector()) 2095 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2096 else 2097 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2098 2099 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2100 // %dst(intty) = G_ADD %intbase, off 2101 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2102 MRI.setType(DstReg, CastPtrTy); 2103 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2104 if (!select(*PtrToInt)) { 2105 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2106 return false; 2107 } 2108 2109 // Also take the opportunity here to try to do some optimization. 2110 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 2111 Register NegatedReg; 2112 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2113 return true; 2114 I.getOperand(2).setReg(NegatedReg); 2115 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2116 return true; 2117 } 2118 2119 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2120 MachineRegisterInfo &MRI) { 2121 // We try to match the immediate variant of LSL, which is actually an alias 2122 // for a special case of UBFM. Otherwise, we fall back to the imported 2123 // selector which will match the register variant. 2124 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2125 const auto &MO = I.getOperand(2); 2126 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2127 if (!VRegAndVal) 2128 return false; 2129 2130 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2131 if (DstTy.isVector()) 2132 return false; 2133 bool Is64Bit = DstTy.getSizeInBits() == 64; 2134 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2135 auto Imm2Fn = Is64Bit ? 
selectShiftB_64(MO) : selectShiftB_32(MO); 2136 2137 if (!Imm1Fn || !Imm2Fn) 2138 return false; 2139 2140 auto NewI = 2141 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2142 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2143 2144 for (auto &RenderFn : *Imm1Fn) 2145 RenderFn(NewI); 2146 for (auto &RenderFn : *Imm2Fn) 2147 RenderFn(NewI); 2148 2149 I.eraseFromParent(); 2150 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2151 } 2152 2153 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2154 MachineInstr &I, MachineRegisterInfo &MRI) { 2155 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2156 // If we're storing a scalar, it doesn't matter what register bank that 2157 // scalar is on. All that matters is the size. 2158 // 2159 // So, if we see something like this (with a 32-bit scalar as an example): 2160 // 2161 // %x:gpr(s32) = ... something ... 2162 // %y:fpr(s32) = COPY %x:gpr(s32) 2163 // G_STORE %y:fpr(s32) 2164 // 2165 // We can fix this up into something like this: 2166 // 2167 // G_STORE %x:gpr(s32) 2168 // 2169 // And then continue the selection process normally. 2170 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2171 if (!DefDstReg.isValid()) 2172 return false; 2173 LLT DefDstTy = MRI.getType(DefDstReg); 2174 Register StoreSrcReg = I.getOperand(0).getReg(); 2175 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2176 2177 // If we get something strange like a physical register, then we shouldn't 2178 // go any further. 2179 if (!DefDstTy.isValid()) 2180 return false; 2181 2182 // Are the source and dst types the same size? 2183 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2184 return false; 2185 2186 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2187 RBI.getRegBank(DefDstReg, MRI, TRI)) 2188 return false; 2189 2190 // We have a cross-bank copy, which is entering a store. Let's fold it. 2191 I.getOperand(0).setReg(DefDstReg); 2192 return true; 2193 } 2194 2195 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2196 assert(I.getParent() && "Instruction should be in a basic block!"); 2197 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2198 2199 MachineBasicBlock &MBB = *I.getParent(); 2200 MachineFunction &MF = *MBB.getParent(); 2201 MachineRegisterInfo &MRI = MF.getRegInfo(); 2202 2203 switch (I.getOpcode()) { 2204 case AArch64::G_DUP: { 2205 // Before selecting a DUP instruction, check if it is better selected as a 2206 // MOV or load from a constant pool. 2207 Register Src = I.getOperand(1).getReg(); 2208 auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI); 2209 if (!ValAndVReg) 2210 return false; 2211 LLVMContext &Ctx = MF.getFunction().getContext(); 2212 Register Dst = I.getOperand(0).getReg(); 2213 auto *CV = ConstantDataVector::getSplat( 2214 MRI.getType(Dst).getNumElements(), 2215 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), 2216 ValAndVReg->Value)); 2217 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2218 return false; 2219 I.eraseFromParent(); 2220 return true; 2221 } 2222 case TargetOpcode::G_SEXT: 2223 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2224 // over a normal extend. 
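    // For example (illustrative):
    //   %e(s32) = G_EXTRACT_VECTOR_ELT %v(<4 x s32>), %idx
    //   %s(s64) = G_SEXT %e(s32)
    // can be selected as a single SMOV (sign-extending element move) rather
    // than an extract followed by a separate sign extend.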
2225 if (selectUSMovFromExtend(I, MRI)) 2226 return true; 2227 return false; 2228 case TargetOpcode::G_BR: 2229 return false; 2230 case TargetOpcode::G_SHL: 2231 return earlySelectSHL(I, MRI); 2232 case TargetOpcode::G_CONSTANT: { 2233 bool IsZero = false; 2234 if (I.getOperand(1).isCImm()) 2235 IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; 2236 else if (I.getOperand(1).isImm()) 2237 IsZero = I.getOperand(1).getImm() == 0; 2238 2239 if (!IsZero) 2240 return false; 2241 2242 Register DefReg = I.getOperand(0).getReg(); 2243 LLT Ty = MRI.getType(DefReg); 2244 if (Ty.getSizeInBits() == 64) { 2245 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2246 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2247 } else if (Ty.getSizeInBits() == 32) { 2248 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2249 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2250 } else 2251 return false; 2252 2253 I.setDesc(TII.get(TargetOpcode::COPY)); 2254 return true; 2255 } 2256 2257 case TargetOpcode::G_ADD: { 2258 // Check if this is being fed by a G_ICMP on either side. 2259 // 2260 // (cmp pred, x, y) + z 2261 // 2262 // In the above case, when the cmp is true, we increment z by 1. So, we can 2263 // fold the add into the cset for the cmp by using cinc. 2264 // 2265 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2266 Register AddDst = I.getOperand(0).getReg(); 2267 Register AddLHS = I.getOperand(1).getReg(); 2268 Register AddRHS = I.getOperand(2).getReg(); 2269 // Only handle scalars. 2270 LLT Ty = MRI.getType(AddLHS); 2271 if (Ty.isVector()) 2272 return false; 2273 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2274 // bits. 2275 unsigned Size = Ty.getSizeInBits(); 2276 if (Size != 32 && Size != 64) 2277 return false; 2278 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2279 if (!MRI.hasOneNonDBGUse(Reg)) 2280 return nullptr; 2281 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2282 // compare. 2283 if (Size == 32) 2284 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2285 // We model scalar compares using 32-bit destinations right now. 2286 // If it's a 64-bit compare, it'll have 64-bit sources. 
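      // So the 64-bit form we are looking for is (illustrative):
      //   %c(s32) = G_ICMP intpred(...), %x(s64), %y(s64)
      //   %z(s64) = G_ZEXT %c(s32)
      //   %add(s64) = G_ADD %lhs, %z
      // which is why the match below goes through a single-use G_ZEXT.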
2287 Register ZExt; 2288 if (!mi_match(Reg, MRI, 2289 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2290 return nullptr; 2291 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2292 if (!Cmp || 2293 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2294 return nullptr; 2295 return Cmp; 2296 }; 2297 // Try to match 2298 // z + (cmp pred, x, y) 2299 MachineInstr *Cmp = MatchCmp(AddRHS); 2300 if (!Cmp) { 2301 // (cmp pred, x, y) + z 2302 std::swap(AddLHS, AddRHS); 2303 Cmp = MatchCmp(AddRHS); 2304 if (!Cmp) 2305 return false; 2306 } 2307 auto &PredOp = Cmp->getOperand(1); 2308 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2309 const AArch64CC::CondCode InvCC = 2310 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2311 MIB.setInstrAndDebugLoc(I); 2312 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2313 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2314 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2315 I.eraseFromParent(); 2316 return true; 2317 } 2318 case TargetOpcode::G_OR: { 2319 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2320 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2321 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2322 Register Dst = I.getOperand(0).getReg(); 2323 LLT Ty = MRI.getType(Dst); 2324 2325 if (!Ty.isScalar()) 2326 return false; 2327 2328 unsigned Size = Ty.getSizeInBits(); 2329 if (Size != 32 && Size != 64) 2330 return false; 2331 2332 Register ShiftSrc; 2333 int64_t ShiftImm; 2334 Register MaskSrc; 2335 int64_t MaskImm; 2336 if (!mi_match( 2337 Dst, MRI, 2338 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2339 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2340 return false; 2341 2342 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2343 return false; 2344 2345 int64_t Immr = Size - ShiftImm; 2346 int64_t Imms = Size - ShiftImm - 1; 2347 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2348 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2349 I.eraseFromParent(); 2350 return true; 2351 } 2352 case TargetOpcode::G_FENCE: { 2353 if (I.getOperand(1).getImm() == 0) 2354 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER)); 2355 else 2356 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB)) 2357 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); 2358 I.eraseFromParent(); 2359 return true; 2360 } 2361 default: 2362 return false; 2363 } 2364 } 2365 2366 bool AArch64InstructionSelector::select(MachineInstr &I) { 2367 assert(I.getParent() && "Instruction should be in a basic block!"); 2368 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2369 2370 MachineBasicBlock &MBB = *I.getParent(); 2371 MachineFunction &MF = *MBB.getParent(); 2372 MachineRegisterInfo &MRI = MF.getRegInfo(); 2373 2374 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); 2375 if (Subtarget->requiresStrictAlign()) { 2376 // We don't support this feature yet. 2377 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2378 return false; 2379 } 2380 2381 MIB.setInstrAndDebugLoc(I); 2382 2383 unsigned Opcode = I.getOpcode(); 2384 // G_PHI requires same handling as PHI 2385 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2386 // Certain non-generic instructions also need some special handling. 
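    // That includes, for example, COPY, PHI (and G_PHI, which is rewritten to
    // PHI below), LOAD_STACK_GUARD and debug instructions, all handled
    // directly here before any call to selectImpl().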
2387 2388 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2389 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2390 2391 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2392 const Register DefReg = I.getOperand(0).getReg(); 2393 const LLT DefTy = MRI.getType(DefReg); 2394 2395 const RegClassOrRegBank &RegClassOrBank = 2396 MRI.getRegClassOrRegBank(DefReg); 2397 2398 const TargetRegisterClass *DefRC 2399 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 2400 if (!DefRC) { 2401 if (!DefTy.isValid()) { 2402 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2403 return false; 2404 } 2405 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 2406 DefRC = getRegClassForTypeOnBank(DefTy, RB); 2407 if (!DefRC) { 2408 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2409 return false; 2410 } 2411 } 2412 2413 I.setDesc(TII.get(TargetOpcode::PHI)); 2414 2415 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2416 } 2417 2418 if (I.isCopy()) 2419 return selectCopy(I, TII, MRI, TRI, RBI); 2420 2421 if (I.isDebugInstr()) 2422 return selectDebugInstr(I, MRI, RBI); 2423 2424 return true; 2425 } 2426 2427 2428 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2429 LLVM_DEBUG( 2430 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2431 return false; 2432 } 2433 2434 // Try to do some lowering before we start instruction selecting. These 2435 // lowerings are purely transformations on the input G_MIR and so selection 2436 // must continue after any modification of the instruction. 2437 if (preISelLower(I)) { 2438 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2439 } 2440 2441 // There may be patterns where the importer can't deal with them optimally, 2442 // but does select it to a suboptimal sequence so our custom C++ selection 2443 // code later never has a chance to work on it. Therefore, we have an early 2444 // selection attempt here to give priority to certain selection routines 2445 // over the imported ones. 2446 if (earlySelect(I)) 2447 return true; 2448 2449 if (selectImpl(I, *CoverageInfo)) 2450 return true; 2451 2452 LLT Ty = 2453 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; 2454 2455 switch (Opcode) { 2456 case TargetOpcode::G_SBFX: 2457 case TargetOpcode::G_UBFX: { 2458 static const unsigned OpcTable[2][2] = { 2459 {AArch64::UBFMWri, AArch64::UBFMXri}, 2460 {AArch64::SBFMWri, AArch64::SBFMXri}}; 2461 bool IsSigned = Opcode == TargetOpcode::G_SBFX; 2462 unsigned Size = Ty.getSizeInBits(); 2463 unsigned Opc = OpcTable[IsSigned][Size == 64]; 2464 auto Cst1 = 2465 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); 2466 assert(Cst1 && "Should have gotten a constant for src 1?"); 2467 auto Cst2 = 2468 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); 2469 assert(Cst2 && "Should have gotten a constant for src 2?"); 2470 auto LSB = Cst1->Value.getZExtValue(); 2471 auto Width = Cst2->Value.getZExtValue(); 2472 auto BitfieldInst = 2473 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) 2474 .addImm(LSB) 2475 .addImm(LSB + Width - 1); 2476 I.eraseFromParent(); 2477 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); 2478 } 2479 case TargetOpcode::G_BRCOND: 2480 return selectCompareBranch(I, MF, MRI); 2481 2482 case TargetOpcode::G_BRINDIRECT: { 2483 I.setDesc(TII.get(AArch64::BR)); 2484 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2485 } 2486 2487 case TargetOpcode::G_BRJT: 2488 return selectBrJT(I, MRI); 2489 2490 case AArch64::G_ADD_LOW: { 2491 // This op may have been separated from it's ADRP companion by the localizer 2492 // or some other code motion pass. Given that many CPUs will try to 2493 // macro fuse these operations anyway, select this into a MOVaddr pseudo 2494 // which will later be expanded into an ADRP+ADD pair after scheduling. 2495 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2496 if (BaseMI->getOpcode() != AArch64::ADRP) { 2497 I.setDesc(TII.get(AArch64::ADDXri)); 2498 I.addOperand(MachineOperand::CreateImm(0)); 2499 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2500 } 2501 assert(TM.getCodeModel() == CodeModel::Small && 2502 "Expected small code model"); 2503 auto Op1 = BaseMI->getOperand(1); 2504 auto Op2 = I.getOperand(2); 2505 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2506 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2507 Op1.getTargetFlags()) 2508 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2509 Op2.getTargetFlags()); 2510 I.eraseFromParent(); 2511 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2512 } 2513 2514 case TargetOpcode::G_BSWAP: { 2515 // Handle vector types for G_BSWAP directly. 2516 Register DstReg = I.getOperand(0).getReg(); 2517 LLT DstTy = MRI.getType(DstReg); 2518 2519 // We should only get vector types here; everything else is handled by the 2520 // importer right now. 2521 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { 2522 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); 2523 return false; 2524 } 2525 2526 // Only handle 4 and 2 element vectors for now. 2527 // TODO: 16-bit elements. 2528 unsigned NumElts = DstTy.getNumElements(); 2529 if (NumElts != 4 && NumElts != 2) { 2530 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); 2531 return false; 2532 } 2533 2534 // Choose the correct opcode for the supported types. Right now, that's 2535 // v2s32, v4s32, and v2s64. 2536 unsigned Opc = 0; 2537 unsigned EltSize = DstTy.getElementType().getSizeInBits(); 2538 if (EltSize == 32) 2539 Opc = (DstTy.getNumElements() == 2) ? 
AArch64::REV32v8i8 2540 : AArch64::REV32v16i8; 2541 else if (EltSize == 64) 2542 Opc = AArch64::REV64v16i8; 2543 2544 // We should always get something by the time we get here... 2545 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); 2546 2547 I.setDesc(TII.get(Opc)); 2548 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2549 } 2550 2551 case TargetOpcode::G_FCONSTANT: 2552 case TargetOpcode::G_CONSTANT: { 2553 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2554 2555 const LLT s8 = LLT::scalar(8); 2556 const LLT s16 = LLT::scalar(16); 2557 const LLT s32 = LLT::scalar(32); 2558 const LLT s64 = LLT::scalar(64); 2559 const LLT s128 = LLT::scalar(128); 2560 const LLT p0 = LLT::pointer(0, 64); 2561 2562 const Register DefReg = I.getOperand(0).getReg(); 2563 const LLT DefTy = MRI.getType(DefReg); 2564 const unsigned DefSize = DefTy.getSizeInBits(); 2565 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2566 2567 // FIXME: Redundant check, but even less readable when factored out. 2568 if (isFP) { 2569 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2570 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2571 << " constant, expected: " << s16 << " or " << s32 2572 << " or " << s64 << " or " << s128 << '\n'); 2573 return false; 2574 } 2575 2576 if (RB.getID() != AArch64::FPRRegBankID) { 2577 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2578 << " constant on bank: " << RB 2579 << ", expected: FPR\n"); 2580 return false; 2581 } 2582 2583 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2584 // can be sure tablegen works correctly and isn't rescued by this code. 2585 // 0.0 is not covered by tablegen for FP128. So we will handle this 2586 // scenario in the code here. 2587 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2588 return false; 2589 } else { 2590 // s32 and s64 are covered by tablegen. 2591 if (Ty != p0 && Ty != s8 && Ty != s16) { 2592 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2593 << " constant, expected: " << s32 << ", " << s64 2594 << ", or " << p0 << '\n'); 2595 return false; 2596 } 2597 2598 if (RB.getID() != AArch64::GPRRegBankID) { 2599 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2600 << " constant on bank: " << RB 2601 << ", expected: GPR\n"); 2602 return false; 2603 } 2604 } 2605 2606 if (isFP) { 2607 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); 2608 // For 16, 64, and 128b values, emit a constant pool load. 2609 switch (DefSize) { 2610 default: 2611 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2612 case 32: 2613 // For s32, use a cp load if we have optsize/minsize. 2614 if (!shouldOptForSize(&MF)) 2615 break; 2616 [[fallthrough]]; 2617 case 16: 2618 case 64: 2619 case 128: { 2620 auto *FPImm = I.getOperand(1).getFPImm(); 2621 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2622 if (!LoadMI) { 2623 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2624 return false; 2625 } 2626 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2627 I.eraseFromParent(); 2628 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2629 } 2630 } 2631 2632 // Either emit a FMOV, or emit a copy to emit a normal mov. 
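    // Only the s32 case without minsize/optsize reaches this point
    // (illustrative flow): the bit pattern is materialized into a GPR32 with
    // MOVi32imm below and then copied over to the FPR bank.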
2633 assert(DefSize == 32 && 2634 "Expected constant pool loads for all sizes other than 32!"); 2635 const Register DefGPRReg = 2636 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2637 MachineOperand &RegOp = I.getOperand(0); 2638 RegOp.setReg(DefGPRReg); 2639 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2640 MIB.buildCopy({DefReg}, {DefGPRReg}); 2641 2642 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2643 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2644 return false; 2645 } 2646 2647 MachineOperand &ImmOp = I.getOperand(1); 2648 // FIXME: Is going through int64_t always correct? 2649 ImmOp.ChangeToImmediate( 2650 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2651 } else if (I.getOperand(1).isCImm()) { 2652 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2653 I.getOperand(1).ChangeToImmediate(Val); 2654 } else if (I.getOperand(1).isImm()) { 2655 uint64_t Val = I.getOperand(1).getImm(); 2656 I.getOperand(1).ChangeToImmediate(Val); 2657 } 2658 2659 const unsigned MovOpc = 2660 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2661 I.setDesc(TII.get(MovOpc)); 2662 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2663 return true; 2664 } 2665 case TargetOpcode::G_EXTRACT: { 2666 Register DstReg = I.getOperand(0).getReg(); 2667 Register SrcReg = I.getOperand(1).getReg(); 2668 LLT SrcTy = MRI.getType(SrcReg); 2669 LLT DstTy = MRI.getType(DstReg); 2670 (void)DstTy; 2671 unsigned SrcSize = SrcTy.getSizeInBits(); 2672 2673 if (SrcTy.getSizeInBits() > 64) { 2674 // This should be an extract of an s128, which is like a vector extract. 2675 if (SrcTy.getSizeInBits() != 128) 2676 return false; 2677 // Only support extracting 64 bits from an s128 at the moment. 2678 if (DstTy.getSizeInBits() != 64) 2679 return false; 2680 2681 unsigned Offset = I.getOperand(2).getImm(); 2682 if (Offset % 64 != 0) 2683 return false; 2684 2685 // Check we have the right regbank always. 2686 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2687 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2688 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2689 2690 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2691 auto NewI = 2692 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2693 .addUse(SrcReg, 0, 2694 Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2695 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI, 2696 AArch64::GPR64RegClass, NewI->getOperand(0)); 2697 I.eraseFromParent(); 2698 return true; 2699 } 2700 2701 // Emit the same code as a vector extract. 2702 // Offset must be a multiple of 64. 2703 unsigned LaneIdx = Offset / 64; 2704 MachineInstr *Extract = emitExtractVectorElt( 2705 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2706 if (!Extract) 2707 return false; 2708 I.eraseFromParent(); 2709 return true; 2710 } 2711 2712 I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); 2713 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2714 Ty.getSizeInBits() - 1); 2715 2716 if (SrcSize < 64) { 2717 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2718 "unexpected G_EXTRACT types"); 2719 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2720 } 2721 2722 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2723 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2724 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2725 .addReg(DstReg, 0, AArch64::sub_32); 2726 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2727 AArch64::GPR32RegClass, MRI); 2728 I.getOperand(0).setReg(DstReg); 2729 2730 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2731 } 2732 2733 case TargetOpcode::G_INSERT: { 2734 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2735 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2736 unsigned DstSize = DstTy.getSizeInBits(); 2737 // Larger inserts are vectors, same-size ones should be something else by 2738 // now (split up or turned into COPYs). 2739 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2740 return false; 2741 2742 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2743 unsigned LSB = I.getOperand(3).getImm(); 2744 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2745 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2746 MachineInstrBuilder(MF, I).addImm(Width - 1); 2747 2748 if (DstSize < 64) { 2749 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2750 "unexpected G_INSERT types"); 2751 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2752 } 2753 2754 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2755 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2756 TII.get(AArch64::SUBREG_TO_REG)) 2757 .addDef(SrcReg) 2758 .addImm(0) 2759 .addUse(I.getOperand(2).getReg()) 2760 .addImm(AArch64::sub_32); 2761 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2762 AArch64::GPR32RegClass, MRI); 2763 I.getOperand(2).setReg(SrcReg); 2764 2765 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2766 } 2767 case TargetOpcode::G_FRAME_INDEX: { 2768 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2769 if (Ty != LLT::pointer(0, 64)) { 2770 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2771 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2772 return false; 2773 } 2774 I.setDesc(TII.get(AArch64::ADDXri)); 2775 2776 // MOs for a #0 shifted immediate. 2777 I.addOperand(MachineOperand::CreateImm(0)); 2778 I.addOperand(MachineOperand::CreateImm(0)); 2779 2780 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2781 } 2782 2783 case TargetOpcode::G_GLOBAL_VALUE: { 2784 auto GV = I.getOperand(1).getGlobal(); 2785 if (GV->isThreadLocal()) 2786 return selectTLSGlobalValue(I, MRI); 2787 2788 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 2789 if (OpFlags & AArch64II::MO_GOT) { 2790 I.setDesc(TII.get(AArch64::LOADgot)); 2791 I.getOperand(1).setTargetFlags(OpFlags); 2792 } else if (TM.getCodeModel() == CodeModel::Large) { 2793 // Materialize the global using movz/movk instructions. 
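      // Illustratively, this expands to a MOVZ of bits [15:0] of the address
      // followed by MOVKs for bits [31:16], [47:32] and [63:48]
      // (see materializeLargeCMVal() above).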
2794 materializeLargeCMVal(I, GV, OpFlags); 2795 I.eraseFromParent(); 2796 return true; 2797 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2798 I.setDesc(TII.get(AArch64::ADR)); 2799 I.getOperand(1).setTargetFlags(OpFlags); 2800 } else { 2801 I.setDesc(TII.get(AArch64::MOVaddr)); 2802 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2803 MachineInstrBuilder MIB(MF, I); 2804 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2805 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2806 } 2807 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2808 } 2809 2810 case TargetOpcode::G_ZEXTLOAD: 2811 case TargetOpcode::G_LOAD: 2812 case TargetOpcode::G_STORE: { 2813 GLoadStore &LdSt = cast<GLoadStore>(I); 2814 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 2815 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 2816 2817 if (PtrTy != LLT::pointer(0, 64)) { 2818 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 2819 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2820 return false; 2821 } 2822 2823 uint64_t MemSizeInBytes = LdSt.getMemSize(); 2824 unsigned MemSizeInBits = LdSt.getMemSizeInBits(); 2825 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 2826 2827 // Need special instructions for atomics that affect ordering. 2828 if (Order != AtomicOrdering::NotAtomic && 2829 Order != AtomicOrdering::Unordered && 2830 Order != AtomicOrdering::Monotonic) { 2831 assert(!isa<GZExtLoad>(LdSt)); 2832 if (MemSizeInBytes > 64) 2833 return false; 2834 2835 if (isa<GLoad>(LdSt)) { 2836 static constexpr unsigned LDAPROpcodes[] = { 2837 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; 2838 static constexpr unsigned LDAROpcodes[] = { 2839 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; 2840 ArrayRef<unsigned> Opcodes = 2841 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent 2842 ? LDAPROpcodes 2843 : LDAROpcodes; 2844 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2845 } else { 2846 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 2847 AArch64::STLRW, AArch64::STLRX}; 2848 Register ValReg = LdSt.getReg(0); 2849 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 2850 // Emit a subreg copy of 32 bits. 2851 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2852 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 2853 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 2854 I.getOperand(0).setReg(NewVal); 2855 } 2856 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2857 } 2858 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2859 return true; 2860 } 2861 2862 #ifndef NDEBUG 2863 const Register PtrReg = LdSt.getPointerReg(); 2864 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 2865 // Check that the pointer register is valid. 2866 assert(PtrRB.getID() == AArch64::GPRRegBankID && 2867 "Load/Store pointer operand isn't a GPR"); 2868 assert(MRI.getType(PtrReg).isPointer() && 2869 "Load/Store pointer operand isn't a pointer"); 2870 #endif 2871 2872 const Register ValReg = LdSt.getReg(0); 2873 const LLT ValTy = MRI.getType(ValReg); 2874 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 2875 2876 // The code below doesn't support truncating stores, so we need to split it 2877 // again. 
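    // For example (illustrative): G_STORE %val(s64), %ptr :: (store (s32))
    // is rewritten to store a COPY of the 32-bit subregister of %val, so the
    // store itself is no longer truncating.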
2878 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2879 unsigned SubReg; 2880 LLT MemTy = LdSt.getMMO().getMemoryType(); 2881 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2882 if (!getSubRegForClass(RC, TRI, SubReg)) 2883 return false; 2884 2885 // Generate a subreg copy. 2886 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 2887 .addReg(ValReg, 0, SubReg) 2888 .getReg(0); 2889 RBI.constrainGenericRegister(Copy, *RC, MRI); 2890 LdSt.getOperand(0).setReg(Copy); 2891 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2892 // If this is an any-extending load from the FPR bank, split it into a regular 2893 // load + extend. 2894 if (RB.getID() == AArch64::FPRRegBankID) { 2895 unsigned SubReg; 2896 LLT MemTy = LdSt.getMMO().getMemoryType(); 2897 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2898 if (!getSubRegForClass(RC, TRI, SubReg)) 2899 return false; 2900 Register OldDst = LdSt.getReg(0); 2901 Register NewDst = 2902 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 2903 LdSt.getOperand(0).setReg(NewDst); 2904 MRI.setRegBank(NewDst, RB); 2905 // Generate a SUBREG_TO_REG to extend it. 2906 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 2907 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 2908 .addImm(0) 2909 .addUse(NewDst) 2910 .addImm(SubReg); 2911 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); 2912 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 2913 MIB.setInstr(LdSt); 2914 } 2915 } 2916 2917 // Helper lambda for partially selecting I. Either returns the original 2918 // instruction with an updated opcode, or a new instruction. 2919 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 2920 bool IsStore = isa<GStore>(I); 2921 const unsigned NewOpc = 2922 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 2923 if (NewOpc == I.getOpcode()) 2924 return nullptr; 2925 // Check if we can fold anything into the addressing mode. 2926 auto AddrModeFns = 2927 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 2928 if (!AddrModeFns) { 2929 // Can't fold anything. Use the original instruction. 2930 I.setDesc(TII.get(NewOpc)); 2931 I.addOperand(MachineOperand::CreateImm(0)); 2932 return &I; 2933 } 2934 2935 // Folded something. Create a new instruction and return it. 2936 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 2937 Register CurValReg = I.getOperand(0).getReg(); 2938 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 2939 NewInst.cloneMemRefs(I); 2940 for (auto &Fn : *AddrModeFns) 2941 Fn(NewInst); 2942 I.eraseFromParent(); 2943 return &*NewInst; 2944 }; 2945 2946 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 2947 if (!LoadStore) 2948 return false; 2949 2950 // If we're storing a 0, use WZR/XZR. 2951 if (Opcode == TargetOpcode::G_STORE) { 2952 auto CVal = getIConstantVRegValWithLookThrough( 2953 LoadStore->getOperand(0).getReg(), MRI); 2954 if (CVal && CVal->Value == 0) { 2955 switch (LoadStore->getOpcode()) { 2956 case AArch64::STRWui: 2957 case AArch64::STRHHui: 2958 case AArch64::STRBBui: 2959 LoadStore->getOperand(0).setReg(AArch64::WZR); 2960 break; 2961 case AArch64::STRXui: 2962 LoadStore->getOperand(0).setReg(AArch64::XZR); 2963 break; 2964 } 2965 } 2966 } 2967 2968 if (IsZExtLoad) { 2969 // The zextload from a smaller type to i32 should be handled by the 2970 // importer. 
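      // For example (illustrative): a G_ZEXTLOAD producing s64 from an s32
      // memory access is selected as a 32-bit load (e.g. LDRWui) into a fresh
      // GPR32, and the code below zero-extends it with SUBREG_TO_REG.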
2971 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 2972 return false; 2973 // If we have a ZEXTLOAD then change the load's type to be a narrower reg 2974 // and zero_extend with SUBREG_TO_REG. 2975 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2976 Register DstReg = LoadStore->getOperand(0).getReg(); 2977 LoadStore->getOperand(0).setReg(LdReg); 2978 2979 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 2980 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 2981 .addImm(0) 2982 .addUse(LdReg) 2983 .addImm(AArch64::sub_32); 2984 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2985 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 2986 MRI); 2987 } 2988 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 2989 } 2990 2991 case TargetOpcode::G_SMULH: 2992 case TargetOpcode::G_UMULH: { 2993 // Reject the various things we don't support yet. 2994 if (unsupportedBinOp(I, RBI, MRI, TRI)) 2995 return false; 2996 2997 const Register DefReg = I.getOperand(0).getReg(); 2998 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2999 3000 if (RB.getID() != AArch64::GPRRegBankID) { 3001 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); 3002 return false; 3003 } 3004 3005 if (Ty != LLT::scalar(64)) { 3006 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty 3007 << ", expected: " << LLT::scalar(64) << '\n'); 3008 return false; 3009 } 3010 3011 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr 3012 : AArch64::UMULHrr; 3013 I.setDesc(TII.get(NewOpc)); 3014 3015 // Now that we selected an opcode, we need to constrain the register 3016 // operands to use appropriate classes. 3017 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3018 } 3019 case TargetOpcode::G_LSHR: 3020 case TargetOpcode::G_ASHR: 3021 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 3022 return selectVectorAshrLshr(I, MRI); 3023 [[fallthrough]]; 3024 case TargetOpcode::G_SHL: 3025 if (Opcode == TargetOpcode::G_SHL && 3026 MRI.getType(I.getOperand(0).getReg()).isVector()) 3027 return selectVectorSHL(I, MRI); 3028 3029 // These shifts were legalized to have 64 bit shift amounts because we 3030 // want to take advantage of the selection patterns that assume the 3031 // immediates are s64s, however, selectBinaryOp will assume both operands 3032 // will have the same bit size. 3033 { 3034 Register SrcReg = I.getOperand(1).getReg(); 3035 Register ShiftReg = I.getOperand(2).getReg(); 3036 const LLT ShiftTy = MRI.getType(ShiftReg); 3037 const LLT SrcTy = MRI.getType(SrcReg); 3038 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 3039 ShiftTy.getSizeInBits() == 64) { 3040 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 3041 // Insert a subregister copy to implement a 64->32 trunc 3042 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 3043 .addReg(ShiftReg, 0, AArch64::sub_32); 3044 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 3045 I.getOperand(2).setReg(Trunc.getReg(0)); 3046 } 3047 } 3048 [[fallthrough]]; 3049 case TargetOpcode::G_OR: { 3050 // Reject the various things we don't support yet. 
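    // As an illustration, unsupportedBinOp() is expected to reject operands
    // that are not virtual registers or that lack a register bank assignment;
    // see its definition earlier in this file.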
3051 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3052 return false; 3053 3054 const unsigned OpSize = Ty.getSizeInBits(); 3055 3056 const Register DefReg = I.getOperand(0).getReg(); 3057 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3058 3059 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 3060 if (NewOpc == I.getOpcode()) 3061 return false; 3062 3063 I.setDesc(TII.get(NewOpc)); 3064 // FIXME: Should the type be always reset in setDesc? 3065 3066 // Now that we selected an opcode, we need to constrain the register 3067 // operands to use appropriate classes. 3068 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3069 } 3070 3071 case TargetOpcode::G_PTR_ADD: { 3072 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 3073 I.eraseFromParent(); 3074 return true; 3075 } 3076 case TargetOpcode::G_SADDO: 3077 case TargetOpcode::G_UADDO: 3078 case TargetOpcode::G_SSUBO: 3079 case TargetOpcode::G_USUBO: { 3080 // Emit the operation and get the correct condition code. 3081 auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(), 3082 I.getOperand(2), I.getOperand(3), MIB); 3083 3084 // Now, put the overflow result in the register given by the first operand 3085 // to the overflow op. CSINC increments the result when the predicate is 3086 // false, so to get the increment when it's true, we need to use the 3087 // inverse. In this case, we want to increment when carry is set. 3088 Register ZReg = AArch64::WZR; 3089 emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg, 3090 getInvertedCondCode(OpAndCC.second), MIB); 3091 I.eraseFromParent(); 3092 return true; 3093 } 3094 3095 case TargetOpcode::G_PTRMASK: { 3096 Register MaskReg = I.getOperand(2).getReg(); 3097 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3098 // TODO: Implement arbitrary cases 3099 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3100 return false; 3101 3102 uint64_t Mask = *MaskVal; 3103 I.setDesc(TII.get(AArch64::ANDXri)); 3104 I.getOperand(2).ChangeToImmediate( 3105 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3106 3107 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3108 } 3109 case TargetOpcode::G_PTRTOINT: 3110 case TargetOpcode::G_TRUNC: { 3111 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3112 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3113 3114 const Register DstReg = I.getOperand(0).getReg(); 3115 const Register SrcReg = I.getOperand(1).getReg(); 3116 3117 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3118 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3119 3120 if (DstRB.getID() != SrcRB.getID()) { 3121 LLVM_DEBUG( 3122 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3123 return false; 3124 } 3125 3126 if (DstRB.getID() == AArch64::GPRRegBankID) { 3127 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3128 if (!DstRC) 3129 return false; 3130 3131 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); 3132 if (!SrcRC) 3133 return false; 3134 3135 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3136 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3137 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); 3138 return false; 3139 } 3140 3141 if (DstRC == SrcRC) { 3142 // Nothing to be done 3143 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3144 SrcTy == LLT::scalar(64)) { 3145 llvm_unreachable("TableGen can import this 
case"); 3146 return false; 3147 } else if (DstRC == &AArch64::GPR32RegClass && 3148 SrcRC == &AArch64::GPR64RegClass) { 3149 I.getOperand(1).setSubReg(AArch64::sub_32); 3150 } else { 3151 LLVM_DEBUG( 3152 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3153 return false; 3154 } 3155 3156 I.setDesc(TII.get(TargetOpcode::COPY)); 3157 return true; 3158 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3159 if (DstTy == LLT::fixed_vector(4, 16) && 3160 SrcTy == LLT::fixed_vector(4, 32)) { 3161 I.setDesc(TII.get(AArch64::XTNv4i16)); 3162 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3163 return true; 3164 } 3165 3166 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3167 MachineInstr *Extract = emitExtractVectorElt( 3168 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3169 if (!Extract) 3170 return false; 3171 I.eraseFromParent(); 3172 return true; 3173 } 3174 3175 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 3176 if (Opcode == TargetOpcode::G_PTRTOINT) { 3177 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3178 I.setDesc(TII.get(TargetOpcode::COPY)); 3179 return selectCopy(I, TII, MRI, TRI, RBI); 3180 } 3181 } 3182 3183 return false; 3184 } 3185 3186 case TargetOpcode::G_ANYEXT: { 3187 if (selectUSMovFromExtend(I, MRI)) 3188 return true; 3189 3190 const Register DstReg = I.getOperand(0).getReg(); 3191 const Register SrcReg = I.getOperand(1).getReg(); 3192 3193 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3194 if (RBDst.getID() != AArch64::GPRRegBankID) { 3195 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3196 << ", expected: GPR\n"); 3197 return false; 3198 } 3199 3200 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3201 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3202 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3203 << ", expected: GPR\n"); 3204 return false; 3205 } 3206 3207 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3208 3209 if (DstSize == 0) { 3210 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3211 return false; 3212 } 3213 3214 if (DstSize != 64 && DstSize > 32) { 3215 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3216 << ", expected: 32 or 64\n"); 3217 return false; 3218 } 3219 // At this point G_ANYEXT is just like a plain COPY, but we need 3220 // to explicitly form the 64-bit value if any. 3221 if (DstSize > 32) { 3222 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3223 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3224 .addDef(ExtSrc) 3225 .addImm(0) 3226 .addUse(SrcReg) 3227 .addImm(AArch64::sub_32); 3228 I.getOperand(1).setReg(ExtSrc); 3229 } 3230 return selectCopy(I, TII, MRI, TRI, RBI); 3231 } 3232 3233 case TargetOpcode::G_ZEXT: 3234 case TargetOpcode::G_SEXT_INREG: 3235 case TargetOpcode::G_SEXT: { 3236 if (selectUSMovFromExtend(I, MRI)) 3237 return true; 3238 3239 unsigned Opcode = I.getOpcode(); 3240 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3241 const Register DefReg = I.getOperand(0).getReg(); 3242 Register SrcReg = I.getOperand(1).getReg(); 3243 const LLT DstTy = MRI.getType(DefReg); 3244 const LLT SrcTy = MRI.getType(SrcReg); 3245 unsigned DstSize = DstTy.getSizeInBits(); 3246 unsigned SrcSize = SrcTy.getSizeInBits(); 3247 3248 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3249 // extended is encoded in the imm. 
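    // For example, for %dst:_(s32) = G_SEXT_INREG %src, 8, SrcSize below
    // becomes 8 and the extension is emitted as an SBFM with immediates
    // 0 and 7.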
3250 if (Opcode == TargetOpcode::G_SEXT_INREG) 3251 SrcSize = I.getOperand(2).getImm(); 3252 3253 if (DstTy.isVector()) 3254 return false; // Should be handled by imported patterns. 3255 3256 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3257 AArch64::GPRRegBankID && 3258 "Unexpected ext regbank"); 3259 3260 MachineInstr *ExtI; 3261 3262 // First check if we're extending the result of a load which has a dest type 3263 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest 3264 // GPR register on AArch64 and all loads which are smaller automatically 3265 // zero-extend the upper bits. E.g. 3266 // %v(s8) = G_LOAD %p, :: (load 1) 3267 // %v2(s32) = G_ZEXT %v(s8) 3268 if (!IsSigned) { 3269 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3270 bool IsGPR = 3271 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3272 if (LoadMI && IsGPR) { 3273 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3274 unsigned BytesLoaded = MemOp->getSize(); 3275 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3276 return selectCopy(I, TII, MRI, TRI, RBI); 3277 } 3278 3279 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3280 // + SUBREG_TO_REG. 3281 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3282 Register SubregToRegSrc = 3283 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3284 const Register ZReg = AArch64::WZR; 3285 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg}) 3286 .addImm(0); 3287 3288 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3289 .addImm(0) 3290 .addUse(SubregToRegSrc) 3291 .addImm(AArch64::sub_32); 3292 3293 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3294 MRI)) { 3295 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3296 return false; 3297 } 3298 3299 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3300 MRI)) { 3301 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3302 return false; 3303 } 3304 3305 I.eraseFromParent(); 3306 return true; 3307 } 3308 } 3309 3310 if (DstSize == 64) { 3311 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3312 // FIXME: Can we avoid manually doing this? 3313 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3314 MRI)) { 3315 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3316 << " operand\n"); 3317 return false; 3318 } 3319 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3320 {&AArch64::GPR64RegClass}, {}) 3321 .addImm(0) 3322 .addUse(SrcReg) 3323 .addImm(AArch64::sub_32) 3324 .getReg(0); 3325 } 3326 3327 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3328 {DefReg}, {SrcReg}) 3329 .addImm(0) 3330 .addImm(SrcSize - 1); 3331 } else if (DstSize <= 32) { 3332 ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri, 3333 {DefReg}, {SrcReg}) 3334 .addImm(0) 3335 .addImm(SrcSize - 1); 3336 } else { 3337 return false; 3338 } 3339 3340 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3341 I.eraseFromParent(); 3342 return true; 3343 } 3344 3345 case TargetOpcode::G_SITOFP: 3346 case TargetOpcode::G_UITOFP: 3347 case TargetOpcode::G_FPTOSI: 3348 case TargetOpcode::G_FPTOUI: { 3349 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3350 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3351 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3352 if (NewOpc == Opcode) 3353 return false; 3354 3355 I.setDesc(TII.get(NewOpc)); 3356 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3357 I.setFlags(MachineInstr::NoFPExcept); 3358 3359 return true; 3360 } 3361 3362 case TargetOpcode::G_FREEZE: 3363 return selectCopy(I, TII, MRI, TRI, RBI); 3364 3365 case TargetOpcode::G_INTTOPTR: 3366 // The importer is currently unable to import pointer types since they 3367 // didn't exist in SelectionDAG. 3368 return selectCopy(I, TII, MRI, TRI, RBI); 3369 3370 case TargetOpcode::G_BITCAST: 3371 // Imported SelectionDAG rules can handle every bitcast except those that 3372 // bitcast from a type to the same type. Ideally, these shouldn't occur 3373 // but we might not run an optimizer that deletes them. The other exception 3374 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3375 // of them. 3376 return selectCopy(I, TII, MRI, TRI, RBI); 3377 3378 case TargetOpcode::G_SELECT: { 3379 auto &Sel = cast<GSelect>(I); 3380 const Register CondReg = Sel.getCondReg(); 3381 const Register TReg = Sel.getTrueReg(); 3382 const Register FReg = Sel.getFalseReg(); 3383 3384 if (tryOptSelect(Sel)) 3385 return true; 3386 3387 // Make sure to use an unused vreg instead of wzr, so that the peephole 3388 // optimizations will be able to optimize these. 3389 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3390 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3391 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3392 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3393 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) 3394 return false; 3395 Sel.eraseFromParent(); 3396 return true; 3397 } 3398 case TargetOpcode::G_ICMP: { 3399 if (Ty.isVector()) 3400 return selectVectorICmp(I, MRI); 3401 3402 if (Ty != LLT::scalar(32)) { 3403 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3404 << ", expected: " << LLT::scalar(32) << '\n'); 3405 return false; 3406 } 3407 3408 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3409 const AArch64CC::CondCode InvCC = 3410 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3411 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3412 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3413 /*Src2=*/AArch64::WZR, InvCC, MIB); 3414 I.eraseFromParent(); 3415 return true; 3416 } 3417 3418 case TargetOpcode::G_FCMP: { 3419 CmpInst::Predicate Pred = 3420 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3421 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3422 Pred) || 3423 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3424 return false; 3425 I.eraseFromParent(); 3426 return true; 3427 } 3428 case TargetOpcode::G_VASTART: 3429 return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI) 3430 : selectVaStartAAPCS(I, MF, MRI); 3431 case TargetOpcode::G_INTRINSIC: 3432 return selectIntrinsic(I, MRI); 3433 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3434 return selectIntrinsicWithSideEffects(I, MRI); 3435 case TargetOpcode::G_IMPLICIT_DEF: { 3436 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 3437 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3438 const Register DstReg = I.getOperand(0).getReg(); 3439 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3440 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3441 RBI.constrainGenericRegister(DstReg, *DstRC, MRI); 3442 return true; 3443 } 3444 case TargetOpcode::G_BLOCK_ADDR: { 3445 if (TM.getCodeModel() == CodeModel::Large) { 3446 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); 3447 I.eraseFromParent(); 3448 return true; 3449 } else { 3450 I.setDesc(TII.get(AArch64::MOVaddrBA)); 3451 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), 3452 I.getOperand(0).getReg()) 3453 .addBlockAddress(I.getOperand(1).getBlockAddress(), 3454 /* Offset */ 0, AArch64II::MO_PAGE) 3455 .addBlockAddress( 3456 I.getOperand(1).getBlockAddress(), /* Offset */ 0, 3457 AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3458 I.eraseFromParent(); 3459 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3460 } 3461 } 3462 case AArch64::G_DUP: { 3463 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by 3464 // imported patterns. Do it manually here. Avoiding generating s16 gpr is 3465 // difficult because at RBS we may end up pessimizing the fpr case if we 3466 // decided to add an anyextend to fix this. Manual selection is the most 3467 // robust solution for now. 3468 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 3469 AArch64::GPRRegBankID) 3470 return false; // We expect the fpr regbank case to be imported. 
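    // Map the destination vector type to the matching gpr DUP opcode below,
    // e.g. a <4 x s16> G_DUP of a GPR scalar selects DUPv4i16gpr.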
3471 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3472 if (VecTy == LLT::fixed_vector(8, 8)) 3473 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3474 else if (VecTy == LLT::fixed_vector(16, 8)) 3475 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3476 else if (VecTy == LLT::fixed_vector(4, 16)) 3477 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3478 else if (VecTy == LLT::fixed_vector(8, 16)) 3479 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3480 else 3481 return false; 3482 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3483 } 3484 case TargetOpcode::G_INTRINSIC_TRUNC: 3485 return selectIntrinsicTrunc(I, MRI); 3486 case TargetOpcode::G_INTRINSIC_ROUND: 3487 return selectIntrinsicRound(I, MRI); 3488 case TargetOpcode::G_BUILD_VECTOR: 3489 return selectBuildVector(I, MRI); 3490 case TargetOpcode::G_MERGE_VALUES: 3491 return selectMergeValues(I, MRI); 3492 case TargetOpcode::G_UNMERGE_VALUES: 3493 return selectUnmergeValues(I, MRI); 3494 case TargetOpcode::G_SHUFFLE_VECTOR: 3495 return selectShuffleVector(I, MRI); 3496 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3497 return selectExtractElt(I, MRI); 3498 case TargetOpcode::G_INSERT_VECTOR_ELT: 3499 return selectInsertElt(I, MRI); 3500 case TargetOpcode::G_CONCAT_VECTORS: 3501 return selectConcatVectors(I, MRI); 3502 case TargetOpcode::G_JUMP_TABLE: 3503 return selectJumpTable(I, MRI); 3504 case TargetOpcode::G_VECREDUCE_FADD: 3505 case TargetOpcode::G_VECREDUCE_ADD: 3506 return selectReduction(I, MRI); 3507 case TargetOpcode::G_MEMCPY: 3508 case TargetOpcode::G_MEMCPY_INLINE: 3509 case TargetOpcode::G_MEMMOVE: 3510 case TargetOpcode::G_MEMSET: 3511 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); 3512 return selectMOPS(I, MRI); 3513 } 3514 3515 return false; 3516 } 3517 3518 bool AArch64InstructionSelector::selectReduction(MachineInstr &I, 3519 MachineRegisterInfo &MRI) { 3520 Register VecReg = I.getOperand(1).getReg(); 3521 LLT VecTy = MRI.getType(VecReg); 3522 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { 3523 // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit 3524 // a subregister copy afterwards. 
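    // Roughly:
    //   %sum:fpr64 = ADDPv2i32 %vec, %vec
    //   %dst:fpr32 = COPY %sum.ssub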
3525 if (VecTy == LLT::fixed_vector(2, 32)) { 3526 Register DstReg = I.getOperand(0).getReg(); 3527 auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass}, 3528 {VecReg, VecReg}); 3529 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3530 .addReg(AddP.getReg(0), 0, AArch64::ssub) 3531 .getReg(0); 3532 RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI); 3533 I.eraseFromParent(); 3534 return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI); 3535 } 3536 3537 unsigned Opc = 0; 3538 if (VecTy == LLT::fixed_vector(16, 8)) 3539 Opc = AArch64::ADDVv16i8v; 3540 else if (VecTy == LLT::fixed_vector(8, 16)) 3541 Opc = AArch64::ADDVv8i16v; 3542 else if (VecTy == LLT::fixed_vector(4, 32)) 3543 Opc = AArch64::ADDVv4i32v; 3544 else if (VecTy == LLT::fixed_vector(2, 64)) 3545 Opc = AArch64::ADDPv2i64p; 3546 else { 3547 LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); 3548 return false; 3549 } 3550 I.setDesc(TII.get(Opc)); 3551 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3552 } 3553 3554 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) { 3555 unsigned Opc = 0; 3556 if (VecTy == LLT::fixed_vector(2, 32)) 3557 Opc = AArch64::FADDPv2i32p; 3558 else if (VecTy == LLT::fixed_vector(2, 64)) 3559 Opc = AArch64::FADDPv2i64p; 3560 else { 3561 LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction"); 3562 return false; 3563 } 3564 I.setDesc(TII.get(Opc)); 3565 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3566 } 3567 return false; 3568 } 3569 3570 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, 3571 MachineRegisterInfo &MRI) { 3572 unsigned Mopcode; 3573 switch (GI.getOpcode()) { 3574 case TargetOpcode::G_MEMCPY: 3575 case TargetOpcode::G_MEMCPY_INLINE: 3576 Mopcode = AArch64::MOPSMemoryCopyPseudo; 3577 break; 3578 case TargetOpcode::G_MEMMOVE: 3579 Mopcode = AArch64::MOPSMemoryMovePseudo; 3580 break; 3581 case TargetOpcode::G_MEMSET: 3582 // For tagged memset see llvm.aarch64.mops.memset.tag 3583 Mopcode = AArch64::MOPSMemorySetPseudo; 3584 break; 3585 } 3586 3587 auto &DstPtr = GI.getOperand(0); 3588 auto &SrcOrVal = GI.getOperand(1); 3589 auto &Size = GI.getOperand(2); 3590 3591 // Create copies of the registers that can be clobbered. 3592 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); 3593 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); 3594 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); 3595 3596 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; 3597 const auto &SrcValRegClass = 3598 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; 3599 3600 // Constrain to specific registers 3601 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); 3602 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); 3603 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); 3604 3605 MIB.buildCopy(DstPtrCopy, DstPtr); 3606 MIB.buildCopy(SrcValCopy, SrcOrVal); 3607 MIB.buildCopy(SizeCopy, Size); 3608 3609 // New instruction uses the copied registers because it must update them. 3610 // The defs are not used since they don't exist in G_MEM*. They are still 3611 // tied. 
3612 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE 3613 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); 3614 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3615 if (IsSet) { 3616 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, 3617 {DstPtrCopy, SizeCopy, SrcValCopy}); 3618 } else { 3619 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); 3620 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, 3621 {DstPtrCopy, SrcValCopy, SizeCopy}); 3622 } 3623 3624 GI.eraseFromParent(); 3625 return true; 3626 } 3627 3628 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3629 MachineRegisterInfo &MRI) { 3630 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3631 Register JTAddr = I.getOperand(0).getReg(); 3632 unsigned JTI = I.getOperand(1).getIndex(); 3633 Register Index = I.getOperand(2).getReg(); 3634 3635 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3636 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3637 3638 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3639 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3640 {TargetReg, ScratchReg}, {JTAddr, Index}) 3641 .addJumpTableIndex(JTI); 3642 // Build the indirect branch. 3643 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3644 I.eraseFromParent(); 3645 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3646 } 3647 3648 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3649 MachineRegisterInfo &MRI) { 3650 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3651 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3652 3653 Register DstReg = I.getOperand(0).getReg(); 3654 unsigned JTI = I.getOperand(1).getIndex(); 3655 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 3656 auto MovMI = 3657 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3658 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3659 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3660 I.eraseFromParent(); 3661 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3662 } 3663 3664 bool AArch64InstructionSelector::selectTLSGlobalValue( 3665 MachineInstr &I, MachineRegisterInfo &MRI) { 3666 if (!STI.isTargetMachO()) 3667 return false; 3668 MachineFunction &MF = *I.getParent()->getParent(); 3669 MF.getFrameInfo().setAdjustsStack(true); 3670 3671 const auto &GlobalOp = I.getOperand(1); 3672 assert(GlobalOp.getOffset() == 0 && 3673 "Shouldn't have an offset on TLS globals!"); 3674 const GlobalValue &GV = *GlobalOp.getGlobal(); 3675 3676 auto LoadGOT = 3677 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3678 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3679 3680 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3681 {LoadGOT.getReg(0)}) 3682 .addImm(0); 3683 3684 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3685 // TLS calls preserve all registers except those that absolutely must be 3686 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3687 // silly). 
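  // $x0 is set up above from the LOADgot result; the BLR below calls the
  // pointer loaded from it, and the result is copied back out of $x0.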
3688 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) 3689 .addUse(AArch64::X0, RegState::Implicit) 3690 .addDef(AArch64::X0, RegState::Implicit) 3691 .addRegMask(TRI.getTLSCallPreservedMask()); 3692 3693 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3694 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3695 MRI); 3696 I.eraseFromParent(); 3697 return true; 3698 } 3699 3700 bool AArch64InstructionSelector::selectIntrinsicTrunc( 3701 MachineInstr &I, MachineRegisterInfo &MRI) const { 3702 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3703 3704 // Select the correct opcode. 3705 unsigned Opc = 0; 3706 if (!SrcTy.isVector()) { 3707 switch (SrcTy.getSizeInBits()) { 3708 default: 3709 case 16: 3710 Opc = AArch64::FRINTZHr; 3711 break; 3712 case 32: 3713 Opc = AArch64::FRINTZSr; 3714 break; 3715 case 64: 3716 Opc = AArch64::FRINTZDr; 3717 break; 3718 } 3719 } else { 3720 unsigned NumElts = SrcTy.getNumElements(); 3721 switch (SrcTy.getElementType().getSizeInBits()) { 3722 default: 3723 break; 3724 case 16: 3725 if (NumElts == 4) 3726 Opc = AArch64::FRINTZv4f16; 3727 else if (NumElts == 8) 3728 Opc = AArch64::FRINTZv8f16; 3729 break; 3730 case 32: 3731 if (NumElts == 2) 3732 Opc = AArch64::FRINTZv2f32; 3733 else if (NumElts == 4) 3734 Opc = AArch64::FRINTZv4f32; 3735 break; 3736 case 64: 3737 if (NumElts == 2) 3738 Opc = AArch64::FRINTZv2f64; 3739 break; 3740 } 3741 } 3742 3743 if (!Opc) { 3744 // Didn't get an opcode above, bail. 3745 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); 3746 return false; 3747 } 3748 3749 // Legalization would have set us up perfectly for this; we just need to 3750 // set the opcode and move on. 3751 I.setDesc(TII.get(Opc)); 3752 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3753 } 3754 3755 bool AArch64InstructionSelector::selectIntrinsicRound( 3756 MachineInstr &I, MachineRegisterInfo &MRI) const { 3757 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); 3758 3759 // Select the correct opcode. 3760 unsigned Opc = 0; 3761 if (!SrcTy.isVector()) { 3762 switch (SrcTy.getSizeInBits()) { 3763 default: 3764 case 16: 3765 Opc = AArch64::FRINTAHr; 3766 break; 3767 case 32: 3768 Opc = AArch64::FRINTASr; 3769 break; 3770 case 64: 3771 Opc = AArch64::FRINTADr; 3772 break; 3773 } 3774 } else { 3775 unsigned NumElts = SrcTy.getNumElements(); 3776 switch (SrcTy.getElementType().getSizeInBits()) { 3777 default: 3778 break; 3779 case 16: 3780 if (NumElts == 4) 3781 Opc = AArch64::FRINTAv4f16; 3782 else if (NumElts == 8) 3783 Opc = AArch64::FRINTAv8f16; 3784 break; 3785 case 32: 3786 if (NumElts == 2) 3787 Opc = AArch64::FRINTAv2f32; 3788 else if (NumElts == 4) 3789 Opc = AArch64::FRINTAv4f32; 3790 break; 3791 case 64: 3792 if (NumElts == 2) 3793 Opc = AArch64::FRINTAv2f64; 3794 break; 3795 } 3796 } 3797 3798 if (!Opc) { 3799 // Didn't get an opcode above, bail. 3800 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); 3801 return false; 3802 } 3803 3804 // Legalization would have set us up perfectly for this; we just need to 3805 // set the opcode and move on. 
3806 I.setDesc(TII.get(Opc)); 3807 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3808 } 3809 3810 bool AArch64InstructionSelector::selectVectorICmp( 3811 MachineInstr &I, MachineRegisterInfo &MRI) { 3812 Register DstReg = I.getOperand(0).getReg(); 3813 LLT DstTy = MRI.getType(DstReg); 3814 Register SrcReg = I.getOperand(2).getReg(); 3815 Register Src2Reg = I.getOperand(3).getReg(); 3816 LLT SrcTy = MRI.getType(SrcReg); 3817 3818 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); 3819 unsigned NumElts = DstTy.getNumElements(); 3820 3821 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b 3822 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 3823 // Third index is cc opcode: 3824 // 0 == eq 3825 // 1 == ugt 3826 // 2 == uge 3827 // 3 == ult 3828 // 4 == ule 3829 // 5 == sgt 3830 // 6 == sge 3831 // 7 == slt 3832 // 8 == sle 3833 // ne is done by negating 'eq' result. 3834 3835 // This table below assumes that for some comparisons the operands will be 3836 // commuted. 3837 // ult op == commute + ugt op 3838 // ule op == commute + uge op 3839 // slt op == commute + sgt op 3840 // sle op == commute + sge op 3841 unsigned PredIdx = 0; 3842 bool SwapOperands = false; 3843 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 3844 switch (Pred) { 3845 case CmpInst::ICMP_NE: 3846 case CmpInst::ICMP_EQ: 3847 PredIdx = 0; 3848 break; 3849 case CmpInst::ICMP_UGT: 3850 PredIdx = 1; 3851 break; 3852 case CmpInst::ICMP_UGE: 3853 PredIdx = 2; 3854 break; 3855 case CmpInst::ICMP_ULT: 3856 PredIdx = 3; 3857 SwapOperands = true; 3858 break; 3859 case CmpInst::ICMP_ULE: 3860 PredIdx = 4; 3861 SwapOperands = true; 3862 break; 3863 case CmpInst::ICMP_SGT: 3864 PredIdx = 5; 3865 break; 3866 case CmpInst::ICMP_SGE: 3867 PredIdx = 6; 3868 break; 3869 case CmpInst::ICMP_SLT: 3870 PredIdx = 7; 3871 SwapOperands = true; 3872 break; 3873 case CmpInst::ICMP_SLE: 3874 PredIdx = 8; 3875 SwapOperands = true; 3876 break; 3877 default: 3878 llvm_unreachable("Unhandled icmp predicate"); 3879 return false; 3880 } 3881 3882 // This table obviously should be tablegen'd when we have our GISel native 3883 // tablegen selector. 
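  // Indexing example: a <4 x s32> signed-greater-than compare maps to
  // OpcTable[2][1][5], i.e. CMGTv4i32.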
3884 3885 static const unsigned OpcTable[4][4][9] = { 3886 { 3887 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3888 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3889 0 /* invalid */}, 3890 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3891 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3892 0 /* invalid */}, 3893 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, 3894 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, 3895 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, 3896 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, 3897 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, 3898 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} 3899 }, 3900 { 3901 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3902 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3903 0 /* invalid */}, 3904 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, 3905 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, 3906 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, 3907 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, 3908 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, 3909 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, 3910 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3911 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3912 0 /* invalid */} 3913 }, 3914 { 3915 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, 3916 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, 3917 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, 3918 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, 3919 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, 3920 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, 3921 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3922 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3923 0 /* invalid */}, 3924 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3925 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3926 0 /* invalid */} 3927 }, 3928 { 3929 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, 3930 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, 3931 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, 3932 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3933 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3934 0 /* invalid */}, 3935 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3936 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3937 0 /* invalid */}, 3938 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3939 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3940 0 /* invalid */} 3941 }, 3942 }; 3943 unsigned EltIdx = Log2_32(SrcEltSize / 8); 3944 unsigned NumEltsIdx = Log2_32(NumElts / 2); 3945 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; 3946 if (!Opc) { 3947 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); 3948 return false; 3949 } 3950 3951 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3952 const TargetRegisterClass *SrcRC = 3953 getRegClassForTypeOnBank(SrcTy, VecRB, true); 3954 if (!SrcRC) { 3955 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3956 return 
false; 3957 } 3958 3959 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; 3960 if (SrcTy.getSizeInBits() == 128) 3961 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; 3962 3963 if (SwapOperands) 3964 std::swap(SrcReg, Src2Reg); 3965 3966 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); 3967 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3968 3969 // Invert if we had a 'ne' cc. 3970 if (NotOpc) { 3971 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); 3972 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3973 } else { 3974 MIB.buildCopy(DstReg, Cmp.getReg(0)); 3975 } 3976 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); 3977 I.eraseFromParent(); 3978 return true; 3979 } 3980 3981 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3982 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3983 MachineIRBuilder &MIRBuilder) const { 3984 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3985 3986 auto BuildFn = [&](unsigned SubregIndex) { 3987 auto Ins = 3988 MIRBuilder 3989 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3990 .addImm(SubregIndex); 3991 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3992 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3993 return &*Ins; 3994 }; 3995 3996 switch (EltSize) { 3997 case 16: 3998 return BuildFn(AArch64::hsub); 3999 case 32: 4000 return BuildFn(AArch64::ssub); 4001 case 64: 4002 return BuildFn(AArch64::dsub); 4003 default: 4004 return nullptr; 4005 } 4006 } 4007 4008 bool AArch64InstructionSelector::selectMergeValues( 4009 MachineInstr &I, MachineRegisterInfo &MRI) { 4010 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 4011 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 4012 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 4013 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 4014 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 4015 4016 if (I.getNumOperands() != 3) 4017 return false; 4018 4019 // Merging 2 s64s into an s128. 
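  // E.g. %dst:_(s128) = G_MERGE_VALUES %lo:_(s64), %hi:_(s64) becomes an
  // IMPLICIT_DEF followed by two lane inserts (%lo into lane 0, %hi into
  // lane 1).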
4020 if (DstTy == LLT::scalar(128)) { 4021 if (SrcTy.getSizeInBits() != 64) 4022 return false; 4023 Register DstReg = I.getOperand(0).getReg(); 4024 Register Src1Reg = I.getOperand(1).getReg(); 4025 Register Src2Reg = I.getOperand(2).getReg(); 4026 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 4027 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg, 4028 /* LaneIdx */ 0, RB, MIB); 4029 if (!InsMI) 4030 return false; 4031 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 4032 Src2Reg, /* LaneIdx */ 1, RB, MIB); 4033 if (!Ins2MI) 4034 return false; 4035 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 4036 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 4037 I.eraseFromParent(); 4038 return true; 4039 } 4040 4041 if (RB.getID() != AArch64::GPRRegBankID) 4042 return false; 4043 4044 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 4045 return false; 4046 4047 auto *DstRC = &AArch64::GPR64RegClass; 4048 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 4049 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 4050 TII.get(TargetOpcode::SUBREG_TO_REG)) 4051 .addDef(SubToRegDef) 4052 .addImm(0) 4053 .addUse(I.getOperand(1).getReg()) 4054 .addImm(AArch64::sub_32); 4055 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 4056 // Need to anyext the second scalar before we can use bfm 4057 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 4058 TII.get(TargetOpcode::SUBREG_TO_REG)) 4059 .addDef(SubToRegDef2) 4060 .addImm(0) 4061 .addUse(I.getOperand(2).getReg()) 4062 .addImm(AArch64::sub_32); 4063 MachineInstr &BFM = 4064 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 4065 .addDef(I.getOperand(0).getReg()) 4066 .addUse(SubToRegDef) 4067 .addUse(SubToRegDef2) 4068 .addImm(32) 4069 .addImm(31); 4070 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 4071 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 4072 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 4073 I.eraseFromParent(); 4074 return true; 4075 } 4076 4077 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 4078 const unsigned EltSize) { 4079 // Choose a lane copy opcode and subregister based off of the size of the 4080 // vector's elements. 4081 switch (EltSize) { 4082 case 8: 4083 CopyOpc = AArch64::DUPi8; 4084 ExtractSubReg = AArch64::bsub; 4085 break; 4086 case 16: 4087 CopyOpc = AArch64::DUPi16; 4088 ExtractSubReg = AArch64::hsub; 4089 break; 4090 case 32: 4091 CopyOpc = AArch64::DUPi32; 4092 ExtractSubReg = AArch64::ssub; 4093 break; 4094 case 64: 4095 CopyOpc = AArch64::DUPi64; 4096 ExtractSubReg = AArch64::dsub; 4097 break; 4098 default: 4099 // Unknown size, bail out. 
4100 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 4101 return false; 4102 } 4103 return true; 4104 } 4105 4106 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 4107 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 4108 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 4109 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4110 unsigned CopyOpc = 0; 4111 unsigned ExtractSubReg = 0; 4112 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 4113 LLVM_DEBUG( 4114 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 4115 return nullptr; 4116 } 4117 4118 const TargetRegisterClass *DstRC = 4119 getRegClassForTypeOnBank(ScalarTy, DstRB, true); 4120 if (!DstRC) { 4121 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 4122 return nullptr; 4123 } 4124 4125 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 4126 const LLT &VecTy = MRI.getType(VecReg); 4127 const TargetRegisterClass *VecRC = 4128 getRegClassForTypeOnBank(VecTy, VecRB, true); 4129 if (!VecRC) { 4130 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 4131 return nullptr; 4132 } 4133 4134 // The register that we're going to copy into. 4135 Register InsertReg = VecReg; 4136 if (!DstReg) 4137 DstReg = MRI.createVirtualRegister(DstRC); 4138 // If the lane index is 0, we just use a subregister COPY. 4139 if (LaneIdx == 0) { 4140 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 4141 .addReg(VecReg, 0, ExtractSubReg); 4142 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4143 return &*Copy; 4144 } 4145 4146 // Lane copies require 128-bit wide registers. If we're dealing with an 4147 // unpacked vector, then we need to move up to that width. Insert an implicit 4148 // def and a subregister insert to get us there. 4149 if (VecTy.getSizeInBits() != 128) { 4150 MachineInstr *ScalarToVector = emitScalarToVector( 4151 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 4152 if (!ScalarToVector) 4153 return nullptr; 4154 InsertReg = ScalarToVector->getOperand(0).getReg(); 4155 } 4156 4157 MachineInstr *LaneCopyMI = 4158 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 4159 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 4160 4161 // Make sure that we actually constrain the initial copy. 4162 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4163 return LaneCopyMI; 4164 } 4165 4166 bool AArch64InstructionSelector::selectExtractElt( 4167 MachineInstr &I, MachineRegisterInfo &MRI) { 4168 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 4169 "unexpected opcode!"); 4170 Register DstReg = I.getOperand(0).getReg(); 4171 const LLT NarrowTy = MRI.getType(DstReg); 4172 const Register SrcReg = I.getOperand(1).getReg(); 4173 const LLT WideTy = MRI.getType(SrcReg); 4174 (void)WideTy; 4175 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 4176 "source register size too small!"); 4177 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 4178 4179 // Need the lane index to determine the correct copy opcode. 4180 MachineOperand &LaneIdxOp = I.getOperand(2); 4181 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 4182 4183 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 4184 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 4185 return false; 4186 } 4187 4188 // Find the index to extract from. 
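  // Only constant lane indices are handled; if the index can't be looked
  // through to a constant, bail out.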
4189 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 4190 if (!VRegAndVal) 4191 return false; 4192 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 4193 4194 4195 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 4196 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 4197 LaneIdx, MIB); 4198 if (!Extract) 4199 return false; 4200 4201 I.eraseFromParent(); 4202 return true; 4203 } 4204 4205 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 4206 MachineInstr &I, MachineRegisterInfo &MRI) { 4207 unsigned NumElts = I.getNumOperands() - 1; 4208 Register SrcReg = I.getOperand(NumElts).getReg(); 4209 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4210 const LLT SrcTy = MRI.getType(SrcReg); 4211 4212 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 4213 if (SrcTy.getSizeInBits() > 128) { 4214 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 4215 return false; 4216 } 4217 4218 // We implement a split vector operation by treating the sub-vectors as 4219 // scalars and extracting them. 4220 const RegisterBank &DstRB = 4221 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 4222 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4223 Register Dst = I.getOperand(OpIdx).getReg(); 4224 MachineInstr *Extract = 4225 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4226 if (!Extract) 4227 return false; 4228 } 4229 I.eraseFromParent(); 4230 return true; 4231 } 4232 4233 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4234 MachineRegisterInfo &MRI) { 4235 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4236 "unexpected opcode"); 4237 4238 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4239 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4240 AArch64::FPRRegBankID || 4241 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4242 AArch64::FPRRegBankID) { 4243 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4244 "currently unsupported.\n"); 4245 return false; 4246 } 4247 4248 // The last operand is the vector source register, and every other operand is 4249 // a register to unpack into. 4250 unsigned NumElts = I.getNumOperands() - 1; 4251 Register SrcReg = I.getOperand(NumElts).getReg(); 4252 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4253 const LLT WideTy = MRI.getType(SrcReg); 4254 (void)WideTy; 4255 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4256 "can only unmerge from vector or s128 types!"); 4257 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4258 "source register size too small!"); 4259 4260 if (!NarrowTy.isScalar()) 4261 return selectSplitVectorUnmerge(I, MRI); 4262 4263 // Choose a lane copy opcode and subregister based off of the size of the 4264 // vector's elements. 4265 unsigned CopyOpc = 0; 4266 unsigned ExtractSubReg = 0; 4267 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4268 return false; 4269 4270 // Set up for the lane copies. 4271 MachineBasicBlock &MBB = *I.getParent(); 4272 4273 // Stores the registers we'll be copying from. 4274 SmallVector<Register, 4> InsertRegs; 4275 4276 // We'll use the first register twice, so we only need NumElts-1 registers. 4277 unsigned NumInsertRegs = NumElts - 1; 4278 4279 // If our elements fit into exactly 128 bits, then we can copy from the source 4280 // directly. 
Otherwise, we need to do a bit of setup with some subregister
4281   // inserts.
4282   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4283     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4284   } else {
4285     // No. We have to perform subregister inserts. For each insert, create an
4286     // implicit def and a subregister insert, and save the register we create.
4287     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4288         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4289         *RBI.getRegBank(SrcReg, MRI, TRI));
4290     unsigned SubReg = 0;
4291     bool Found = getSubRegForClass(RC, TRI, SubReg);
4292     (void)Found;
4293     assert(Found && "expected to find last operand's subreg idx");
4294     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4295       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4296       MachineInstr &ImpDefMI =
4297           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4298                    ImpDefReg);
4299
4300       // Now, create the subregister insert from SrcReg.
4301       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4302       MachineInstr &InsMI =
4303           *BuildMI(MBB, I, I.getDebugLoc(),
4304                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4305                .addUse(ImpDefReg)
4306                .addUse(SrcReg)
4307                .addImm(SubReg);
4308
4309       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4310       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4311
4312       // Save the register so that we can copy from it after.
4313       InsertRegs.push_back(InsertReg);
4314     }
4315   }
4316
4317   // Now that we've created any necessary subregister inserts, we can
4318   // create the copies.
4319   //
4320   // Perform the first copy separately as a subregister copy.
4321   Register CopyTo = I.getOperand(0).getReg();
4322   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4323                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4324   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4325
4326   // Now, perform the remaining copies as vector lane copies.
4327   unsigned LaneIdx = 1;
4328   for (Register InsReg : InsertRegs) {
4329     Register CopyTo = I.getOperand(LaneIdx).getReg();
4330     MachineInstr &CopyInst =
4331         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4332              .addUse(InsReg)
4333              .addImm(LaneIdx);
4334     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4335     ++LaneIdx;
4336   }
4337
4338   // Separately constrain the first copy's destination. Because of the
4339   // limitation in constrainOperandRegClass, we can't guarantee that this will
4340   // actually be constrained. So, do it ourselves using the second operand.
4341   const TargetRegisterClass *RC =
4342       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4343   if (!RC) {
4344     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4345     return false;
4346   }
4347
4348   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4349   I.eraseFromParent();
4350   return true;
4351 }
4352
4353 bool AArch64InstructionSelector::selectConcatVectors(
4354     MachineInstr &I, MachineRegisterInfo &MRI) {
4355   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4356          "Unexpected opcode");
4357   Register Dst = I.getOperand(0).getReg();
4358   Register Op1 = I.getOperand(1).getReg();
4359   Register Op2 = I.getOperand(2).getReg();
4360   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4361   if (!ConcatMI)
4362     return false;
4363   I.eraseFromParent();
4364   return true;
4365 }
4366
4367 unsigned
4368 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4369                                                   MachineFunction &MF) const {
4370   Type *CPTy = CPVal->getType();
4371   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4372
4373   MachineConstantPool *MCP = MF.getConstantPool();
4374   return MCP->getConstantPoolIndex(CPVal, Alignment);
4375 }
4376
4377 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4378     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4379   auto &MF = MIRBuilder.getMF();
4380   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4381
4382   auto Adrp =
4383       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4384           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4385
4386   MachineInstr *LoadMI = nullptr;
4387   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4388   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4389   switch (Size) {
4390   case 16:
4391     LoadMI =
4392         &*MIRBuilder
4393               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4394               .addConstantPoolIndex(CPIdx, 0,
4395                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4396     break;
4397   case 8:
4398     LoadMI =
4399         &*MIRBuilder
4400               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4401               .addConstantPoolIndex(CPIdx, 0,
4402                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4403     break;
4404   case 4:
4405     LoadMI =
4406         &*MIRBuilder
4407               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4408               .addConstantPoolIndex(CPIdx, 0,
4409                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4410     break;
4411   case 2:
4412     LoadMI =
4413         &*MIRBuilder
4414               .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
4415               .addConstantPoolIndex(CPIdx, 0,
4416                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4417     break;
4418   default:
4419     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4420                       << *CPVal->getType());
4421     return nullptr;
4422   }
4423   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4424                                                     MachineMemOperand::MOLoad,
4425                                                     Size, Align(Size)));
4426   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4427   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4428   return LoadMI;
4429 }
4430
4431 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4432 /// size and RB.
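/// For example, a 64-bit element on the GPR bank yields {INSvi64gpr, dsub},
/// while the same size on the FPR bank yields {INSvi64lane, dsub}.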
4433 static std::pair<unsigned, unsigned> 4434 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4435 unsigned Opc, SubregIdx; 4436 if (RB.getID() == AArch64::GPRRegBankID) { 4437 if (EltSize == 16) { 4438 Opc = AArch64::INSvi16gpr; 4439 SubregIdx = AArch64::ssub; 4440 } else if (EltSize == 32) { 4441 Opc = AArch64::INSvi32gpr; 4442 SubregIdx = AArch64::ssub; 4443 } else if (EltSize == 64) { 4444 Opc = AArch64::INSvi64gpr; 4445 SubregIdx = AArch64::dsub; 4446 } else { 4447 llvm_unreachable("invalid elt size!"); 4448 } 4449 } else { 4450 if (EltSize == 8) { 4451 Opc = AArch64::INSvi8lane; 4452 SubregIdx = AArch64::bsub; 4453 } else if (EltSize == 16) { 4454 Opc = AArch64::INSvi16lane; 4455 SubregIdx = AArch64::hsub; 4456 } else if (EltSize == 32) { 4457 Opc = AArch64::INSvi32lane; 4458 SubregIdx = AArch64::ssub; 4459 } else if (EltSize == 64) { 4460 Opc = AArch64::INSvi64lane; 4461 SubregIdx = AArch64::dsub; 4462 } else { 4463 llvm_unreachable("invalid elt size!"); 4464 } 4465 } 4466 return std::make_pair(Opc, SubregIdx); 4467 } 4468 4469 MachineInstr *AArch64InstructionSelector::emitInstr( 4470 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4471 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4472 const ComplexRendererFns &RenderFns) const { 4473 assert(Opcode && "Expected an opcode?"); 4474 assert(!isPreISelGenericOpcode(Opcode) && 4475 "Function should only be used to produce selected instructions!"); 4476 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4477 if (RenderFns) 4478 for (auto &Fn : *RenderFns) 4479 Fn(MI); 4480 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4481 return &*MI; 4482 } 4483 4484 MachineInstr *AArch64InstructionSelector::emitAddSub( 4485 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4486 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4487 MachineIRBuilder &MIRBuilder) const { 4488 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4489 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4490 auto Ty = MRI.getType(LHS.getReg()); 4491 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4492 unsigned Size = Ty.getSizeInBits(); 4493 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4494 bool Is32Bit = Size == 32; 4495 4496 // INSTRri form with positive arithmetic immediate. 4497 if (auto Fns = selectArithImmed(RHS)) 4498 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4499 MIRBuilder, Fns); 4500 4501 // INSTRri form with negative arithmetic immediate. 4502 if (auto Fns = selectNegArithImmed(RHS)) 4503 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4504 MIRBuilder, Fns); 4505 4506 // INSTRrx form. 4507 if (auto Fns = selectArithExtendedRegister(RHS)) 4508 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4509 MIRBuilder, Fns); 4510 4511 // INSTRrs form. 
4512 if (auto Fns = selectShiftedRegister(RHS)) 4513 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4514 MIRBuilder, Fns); 4515 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4516 MIRBuilder); 4517 } 4518 4519 MachineInstr * 4520 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4521 MachineOperand &RHS, 4522 MachineIRBuilder &MIRBuilder) const { 4523 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4524 {{AArch64::ADDXri, AArch64::ADDWri}, 4525 {AArch64::ADDXrs, AArch64::ADDWrs}, 4526 {AArch64::ADDXrr, AArch64::ADDWrr}, 4527 {AArch64::SUBXri, AArch64::SUBWri}, 4528 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4529 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4530 } 4531 4532 MachineInstr * 4533 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4534 MachineOperand &RHS, 4535 MachineIRBuilder &MIRBuilder) const { 4536 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4537 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4538 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4539 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4540 {AArch64::SUBSXri, AArch64::SUBSWri}, 4541 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4542 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4543 } 4544 4545 MachineInstr * 4546 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4547 MachineOperand &RHS, 4548 MachineIRBuilder &MIRBuilder) const { 4549 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4550 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4551 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4552 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4553 {AArch64::ADDSXri, AArch64::ADDSWri}, 4554 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4555 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4556 } 4557 4558 MachineInstr * 4559 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4560 MachineIRBuilder &MIRBuilder) const { 4561 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4562 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4563 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4564 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4565 } 4566 4567 MachineInstr * 4568 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4569 MachineIRBuilder &MIRBuilder) const { 4570 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4571 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4572 LLT Ty = MRI.getType(LHS.getReg()); 4573 unsigned RegSize = Ty.getSizeInBits(); 4574 bool Is32Bit = (RegSize == 32); 4575 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4576 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4577 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4578 // ANDS needs a logical immediate for its immediate form. Check if we can 4579 // fold one in. 
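  // E.g. a test against 0xff on a 64-bit register can be encoded directly as
  // ANDSXri with encodeLogicalImmediate(0xff, 64).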
4580 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4581 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4582 4583 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4584 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4585 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4586 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4587 return &*TstMI; 4588 } 4589 } 4590 4591 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4592 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4593 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4594 } 4595 4596 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4597 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4598 MachineIRBuilder &MIRBuilder) const { 4599 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4600 assert(Predicate.isPredicate() && "Expected predicate?"); 4601 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4602 LLT CmpTy = MRI.getType(LHS.getReg()); 4603 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4604 unsigned Size = CmpTy.getSizeInBits(); 4605 (void)Size; 4606 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4607 // Fold the compare into a cmn or tst if possible. 4608 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4609 return FoldCmp; 4610 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4611 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4612 } 4613 4614 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4615 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4616 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4617 #ifndef NDEBUG 4618 LLT Ty = MRI.getType(Dst); 4619 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4620 "Expected a 32-bit scalar register?"); 4621 #endif 4622 const Register ZReg = AArch64::WZR; 4623 AArch64CC::CondCode CC1, CC2; 4624 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4625 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4626 if (CC2 == AArch64CC::AL) 4627 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4628 MIRBuilder); 4629 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4630 Register Def1Reg = MRI.createVirtualRegister(RC); 4631 Register Def2Reg = MRI.createVirtualRegister(RC); 4632 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4633 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4634 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4635 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4636 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4637 return &*OrMI; 4638 } 4639 4640 MachineInstr *AArch64InstructionSelector::emitFPCompare( 4641 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, 4642 std::optional<CmpInst::Predicate> Pred) const { 4643 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4644 LLT Ty = MRI.getType(LHS); 4645 if (Ty.isVector()) 4646 return nullptr; 4647 unsigned OpSize = Ty.getSizeInBits(); 4648 if (OpSize != 32 && OpSize != 64) 4649 return nullptr; 4650 4651 // If this is a compare against +0.0, then we don't have 4652 // to explicitly materialize a constant. 
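  // (FCMPSri/FCMPDri compare against an implicit #0.0, so the zero never has
  // to live in a register.)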
4653 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); 4654 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); 4655 4656 auto IsEqualityPred = [](CmpInst::Predicate P) { 4657 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || 4658 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; 4659 }; 4660 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { 4661 // Try commutating the operands. 4662 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); 4663 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { 4664 ShouldUseImm = true; 4665 std::swap(LHS, RHS); 4666 } 4667 } 4668 unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, 4669 {AArch64::FCMPSri, AArch64::FCMPDri}}; 4670 unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; 4671 4672 // Partially build the compare. Decide if we need to add a use for the 4673 // third operand based off whether or not we're comparing against 0.0. 4674 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); 4675 CmpMI.setMIFlags(MachineInstr::NoFPExcept); 4676 if (!ShouldUseImm) 4677 CmpMI.addUse(RHS); 4678 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); 4679 return &*CmpMI; 4680 } 4681 4682 MachineInstr *AArch64InstructionSelector::emitVectorConcat( 4683 std::optional<Register> Dst, Register Op1, Register Op2, 4684 MachineIRBuilder &MIRBuilder) const { 4685 // We implement a vector concat by: 4686 // 1. Use scalar_to_vector to insert the lower vector into the larger dest 4687 // 2. Insert the upper vector into the destination's upper element 4688 // TODO: some of this code is common with G_BUILD_VECTOR handling. 4689 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4690 4691 const LLT Op1Ty = MRI.getType(Op1); 4692 const LLT Op2Ty = MRI.getType(Op2); 4693 4694 if (Op1Ty != Op2Ty) { 4695 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); 4696 return nullptr; 4697 } 4698 assert(Op1Ty.isVector() && "Expected a vector for vector concat"); 4699 4700 if (Op1Ty.getSizeInBits() >= 128) { 4701 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); 4702 return nullptr; 4703 } 4704 4705 // At the moment we just support 64 bit vector concats. 4706 if (Op1Ty.getSizeInBits() != 64) { 4707 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); 4708 return nullptr; 4709 } 4710 4711 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); 4712 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); 4713 const TargetRegisterClass *DstRC = 4714 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); 4715 4716 MachineInstr *WidenedOp1 = 4717 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); 4718 MachineInstr *WidenedOp2 = 4719 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); 4720 if (!WidenedOp1 || !WidenedOp2) { 4721 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); 4722 return nullptr; 4723 } 4724 4725 // Now do the insert of the upper element. 
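  // E.g. for two <2 x s32> operands this is an INSvi64lane moving lane 0 of
  // the widened second operand into lane 1 of the widened first operand.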
  unsigned InsertOpc, InsSubRegIdx;
  std::tie(InsertOpc, InsSubRegIdx) =
      getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());

  if (!Dst)
    Dst = MRI.createVirtualRegister(DstRC);
  auto InsElt =
      MIRBuilder
          .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
          .addImm(1) /* Lane index */
          .addUse(WidenedOp2->getOperand(0).getReg())
          .addImm(0);
  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return &*InsElt;
}

MachineInstr *
AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
                                      Register Src2, AArch64CC::CondCode Pred,
                                      MachineIRBuilder &MIRBuilder) const {
  auto &MRI = *MIRBuilder.getMRI();
  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
  // If we used a register class, then this won't necessarily have an LLT.
  // Compute the size based on whether we have a class or a bank.
  unsigned Size;
  if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
    Size = TRI.getRegSizeInBits(*RC);
  else
    Size = MRI.getType(Dst).getSizeInBits();
  // Some opcodes use s1.
  assert(Size <= 64 && "Expected 64 bits or less only!");
  static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
  unsigned Opc = OpcTable[Size == 64];
  auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
  constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
  return &*CSINC;
}

std::pair<MachineInstr *, AArch64CC::CondCode>
AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
                                           MachineOperand &LHS,
                                           MachineOperand &RHS,
                                           MachineIRBuilder &MIRBuilder) const {
  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
  case TargetOpcode::G_UADDO:
    return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
  case TargetOpcode::G_SSUBO:
    return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
  case TargetOpcode::G_USUBO:
    return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
  }
}

/// Returns true if \p Val is a tree of AND/OR/CMP operations that can be
/// expressed as a conjunction.
/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
///                     changing the conditions on the CMP tests.
///                     (this means we can call emitConjunctionRec() with
///                      Negate==true on this sub-tree)
/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
///                     cannot do the negation naturally. We are required to
///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
///                     subexpression must be negated. This happens when the
///                     outer expression is an OR. We can use this fact to know
///                     that we have a double negation (or (or ...) ...) that
///                     can be implemented for free.
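///
/// For example, a tree such as
///   %c = G_ICMP eq, %a, %b
///   %d = G_ICMP slt, %x, %y
///   %v = G_AND %c, %d
/// can typically be emitted as one compare followed by conditional compares
/// (CCMP/FCCMP), leaving the combined result in NZCV instead of materializing
/// each boolean into a register.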
static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
                               bool WillNegate, MachineRegisterInfo &MRI,
                               unsigned Depth = 0) {
  if (!MRI.hasOneNonDBGUse(Val))
    return false;
  MachineInstr *ValDef = MRI.getVRegDef(Val);
  unsigned Opcode = ValDef->getOpcode();
  if (isa<GAnyCmp>(ValDef)) {
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
    bool IsOR = Opcode == TargetOpcode::G_OR;
    Register O0 = ValDef->getOperand(1).getReg();
    Register O1 = ValDef->getOperand(2).getReg();
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
      return false;

    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}

MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
    Register LHS, Register RHS, CmpInst::Predicate CC,
    AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
    MachineIRBuilder &MIB) const {
  // TODO: emit CMN as an optimization.
  auto &MRI = *MIB.getMRI();
  LLT OpTy = MRI.getType(LHS);
  assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
  unsigned CCmpOpc;
  std::optional<ValueAndVReg> C;
  if (CmpInst::isIntPredicate(CC)) {
    C = getIConstantVRegValWithLookThrough(RHS, MRI);
    if (C && C->Value.ult(32))
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
    else
      CCmpOpc = OpTy.getSizeInBits() == 32 ?
AArch64::CCMPWr : AArch64::CCMPXr; 4866 } else { 4867 switch (OpTy.getSizeInBits()) { 4868 case 16: 4869 CCmpOpc = AArch64::FCCMPHrr; 4870 break; 4871 case 32: 4872 CCmpOpc = AArch64::FCCMPSrr; 4873 break; 4874 case 64: 4875 CCmpOpc = AArch64::FCCMPDrr; 4876 break; 4877 default: 4878 return nullptr; 4879 } 4880 } 4881 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 4882 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 4883 auto CCmp = 4884 MIB.buildInstr(CCmpOpc, {}, {LHS}); 4885 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi) 4886 CCmp.addImm(C->Value.getZExtValue()); 4887 else 4888 CCmp.addReg(RHS); 4889 CCmp.addImm(NZCV).addImm(Predicate); 4890 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); 4891 return &*CCmp; 4892 } 4893 4894 MachineInstr *AArch64InstructionSelector::emitConjunctionRec( 4895 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, 4896 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { 4897 // We're at a tree leaf, produce a conditional comparison operation. 4898 auto &MRI = *MIB.getMRI(); 4899 MachineInstr *ValDef = MRI.getVRegDef(Val); 4900 unsigned Opcode = ValDef->getOpcode(); 4901 if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { 4902 Register LHS = Cmp->getLHSReg(); 4903 Register RHS = Cmp->getRHSReg(); 4904 CmpInst::Predicate CC = Cmp->getCond(); 4905 if (Negate) 4906 CC = CmpInst::getInversePredicate(CC); 4907 if (isa<GICmp>(Cmp)) { 4908 OutCC = changeICMPPredToAArch64CC(CC); 4909 } else { 4910 // Handle special FP cases. 4911 AArch64CC::CondCode ExtraCC; 4912 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 4913 // Some floating point conditions can't be tested with a single condition 4914 // code. Construct an additional comparison in this case. 4915 if (ExtraCC != AArch64CC::AL) { 4916 MachineInstr *ExtraCmp; 4917 if (!CCOp) 4918 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); 4919 else 4920 ExtraCmp = 4921 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); 4922 CCOp = ExtraCmp->getOperand(0).getReg(); 4923 Predicate = ExtraCC; 4924 } 4925 } 4926 4927 // Produce a normal comparison if we are first in the chain 4928 if (!CCOp) { 4929 auto Dst = MRI.cloneVirtualRegister(LHS); 4930 if (isa<GICmp>(Cmp)) 4931 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); 4932 return emitFPCompare(Cmp->getOperand(2).getReg(), 4933 Cmp->getOperand(3).getReg(), MIB); 4934 } 4935 // Otherwise produce a ccmp. 4936 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); 4937 } 4938 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); 4939 4940 bool IsOR = Opcode == TargetOpcode::G_OR; 4941 4942 Register LHS = ValDef->getOperand(1).getReg(); 4943 bool CanNegateL; 4944 bool MustBeFirstL; 4945 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); 4946 assert(ValidL && "Valid conjunction/disjunction tree"); 4947 (void)ValidL; 4948 4949 Register RHS = ValDef->getOperand(2).getReg(); 4950 bool CanNegateR; 4951 bool MustBeFirstR; 4952 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); 4953 assert(ValidR && "Valid conjunction/disjunction tree"); 4954 (void)ValidR; 4955 4956 // Swap sub-tree that must come first to the right side. 
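  // (emitConjunctionRec emits the right-hand sub-tree first below, so anything
  // that must be emitted first has to end up on the right.)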
4957 if (MustBeFirstL) { 4958 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 4959 std::swap(LHS, RHS); 4960 std::swap(CanNegateL, CanNegateR); 4961 std::swap(MustBeFirstL, MustBeFirstR); 4962 } 4963 4964 bool NegateR; 4965 bool NegateAfterR; 4966 bool NegateL; 4967 bool NegateAfterAll; 4968 if (Opcode == TargetOpcode::G_OR) { 4969 // Swap the sub-tree that we can negate naturally to the left. 4970 if (!CanNegateL) { 4971 assert(CanNegateR && "at least one side must be negatable"); 4972 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 4973 assert(!Negate); 4974 std::swap(LHS, RHS); 4975 NegateR = false; 4976 NegateAfterR = true; 4977 } else { 4978 // Negate the left sub-tree if possible, otherwise negate the result. 4979 NegateR = CanNegateR; 4980 NegateAfterR = !CanNegateR; 4981 } 4982 NegateL = true; 4983 NegateAfterAll = !Negate; 4984 } else { 4985 assert(Opcode == TargetOpcode::G_AND && 4986 "Valid conjunction/disjunction tree"); 4987 assert(!Negate && "Valid conjunction/disjunction tree"); 4988 4989 NegateL = false; 4990 NegateR = false; 4991 NegateAfterR = false; 4992 NegateAfterAll = false; 4993 } 4994 4995 // Emit sub-trees. 4996 AArch64CC::CondCode RHSCC; 4997 MachineInstr *CmpR = 4998 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); 4999 if (NegateAfterR) 5000 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 5001 MachineInstr *CmpL = emitConjunctionRec( 5002 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); 5003 if (NegateAfterAll) 5004 OutCC = AArch64CC::getInvertedCondCode(OutCC); 5005 return CmpL; 5006 } 5007 5008 MachineInstr *AArch64InstructionSelector::emitConjunction( 5009 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { 5010 bool DummyCanNegate; 5011 bool DummyMustBeFirst; 5012 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, 5013 *MIB.getMRI())) 5014 return nullptr; 5015 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); 5016 } 5017 5018 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, 5019 MachineInstr &CondMI) { 5020 AArch64CC::CondCode AArch64CC; 5021 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); 5022 if (!ConjMI) 5023 return false; 5024 5025 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); 5026 SelI.eraseFromParent(); 5027 return true; 5028 } 5029 5030 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { 5031 MachineRegisterInfo &MRI = *MIB.getMRI(); 5032 // We want to recognize this pattern: 5033 // 5034 // $z = G_FCMP pred, $x, $y 5035 // ... 5036 // $w = G_SELECT $z, $a, $b 5037 // 5038 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 5039 // some copies/truncs in between.) 5040 // 5041 // If we see this, then we can emit something like this: 5042 // 5043 // fcmp $x, $y 5044 // fcsel $w, $a, $b, pred 5045 // 5046 // Rather than emitting both of the rather long sequences in the standard 5047 // G_FCMP/G_SELECT select methods. 5048 5049 // First, check if the condition is defined by a compare. 5050 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 5051 5052 // We can only fold if all of the defs have one use. 5053 Register CondDefReg = CondDef->getOperand(0).getReg(); 5054 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 5055 // Unless it's another select. 
5056 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 5057 if (CondDef == &UI) 5058 continue; 5059 if (UI.getOpcode() != TargetOpcode::G_SELECT) 5060 return false; 5061 } 5062 } 5063 5064 // Is the condition defined by a compare? 5065 unsigned CondOpc = CondDef->getOpcode(); 5066 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { 5067 if (tryOptSelectConjunction(I, *CondDef)) 5068 return true; 5069 return false; 5070 } 5071 5072 AArch64CC::CondCode CondCode; 5073 if (CondOpc == TargetOpcode::G_ICMP) { 5074 auto Pred = 5075 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5076 CondCode = changeICMPPredToAArch64CC(Pred); 5077 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 5078 CondDef->getOperand(1), MIB); 5079 } else { 5080 // Get the condition code for the select. 5081 auto Pred = 5082 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5083 AArch64CC::CondCode CondCode2; 5084 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 5085 5086 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 5087 // instructions to emit the comparison. 5088 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 5089 // unnecessary. 5090 if (CondCode2 != AArch64CC::AL) 5091 return false; 5092 5093 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 5094 CondDef->getOperand(3).getReg(), MIB)) { 5095 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 5096 return false; 5097 } 5098 } 5099 5100 // Emit the select. 5101 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 5102 I.getOperand(3).getReg(), CondCode, MIB); 5103 I.eraseFromParent(); 5104 return true; 5105 } 5106 5107 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 5108 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 5109 MachineIRBuilder &MIRBuilder) const { 5110 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 5111 "Unexpected MachineOperand"); 5112 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5113 // We want to find this sort of thing: 5114 // x = G_SUB 0, y 5115 // G_ICMP z, x 5116 // 5117 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 5118 // e.g: 5119 // 5120 // cmn z, y 5121 5122 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 5123 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 5124 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 5125 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 5126 // Given this: 5127 // 5128 // x = G_SUB 0, y 5129 // G_ICMP x, z 5130 // 5131 // Produce this: 5132 // 5133 // cmn y, z 5134 if (isCMN(LHSDef, P, MRI)) 5135 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 5136 5137 // Same idea here, but with the RHS of the compare instead: 5138 // 5139 // Given this: 5140 // 5141 // x = G_SUB 0, y 5142 // G_ICMP z, x 5143 // 5144 // Produce this: 5145 // 5146 // cmn z, y 5147 if (isCMN(RHSDef, P, MRI)) 5148 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 5149 5150 // Given this: 5151 // 5152 // z = G_AND x, y 5153 // G_ICMP z, 0 5154 // 5155 // Produce this if the compare is signed: 5156 // 5157 // tst x, y 5158 if (!CmpInst::isUnsigned(P) && LHSDef && 5159 LHSDef->getOpcode() == TargetOpcode::G_AND) { 5160 // Make sure that the RHS is 0. 
5161 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5162 if (!ValAndVReg || ValAndVReg->Value != 0) 5163 return nullptr; 5164 5165 return emitTST(LHSDef->getOperand(1), 5166 LHSDef->getOperand(2), MIRBuilder); 5167 } 5168 5169 return nullptr; 5170 } 5171 5172 bool AArch64InstructionSelector::selectShuffleVector( 5173 MachineInstr &I, MachineRegisterInfo &MRI) { 5174 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5175 Register Src1Reg = I.getOperand(1).getReg(); 5176 const LLT Src1Ty = MRI.getType(Src1Reg); 5177 Register Src2Reg = I.getOperand(2).getReg(); 5178 const LLT Src2Ty = MRI.getType(Src2Reg); 5179 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 5180 5181 MachineBasicBlock &MBB = *I.getParent(); 5182 MachineFunction &MF = *MBB.getParent(); 5183 LLVMContext &Ctx = MF.getFunction().getContext(); 5184 5185 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 5186 // it's originated from a <1 x T> type. Those should have been lowered into 5187 // G_BUILD_VECTOR earlier. 5188 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 5189 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 5190 return false; 5191 } 5192 5193 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 5194 5195 SmallVector<Constant *, 64> CstIdxs; 5196 for (int Val : Mask) { 5197 // For now, any undef indexes we'll just assume to be 0. This should be 5198 // optimized in future, e.g. to select DUP etc. 5199 Val = Val < 0 ? 0 : Val; 5200 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5201 unsigned Offset = Byte + Val * BytesPerElt; 5202 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 5203 } 5204 } 5205 5206 // Use a constant pool to load the index vector for TBL. 5207 Constant *CPVal = ConstantVector::get(CstIdxs); 5208 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 5209 if (!IndexLoad) { 5210 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 5211 return false; 5212 } 5213 5214 if (DstTy.getSizeInBits() != 128) { 5215 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 5216 // This case can be done with TBL1. 5217 MachineInstr *Concat = 5218 emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB); 5219 if (!Concat) { 5220 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 5221 return false; 5222 } 5223 5224 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 5225 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 5226 IndexLoad->getOperand(0).getReg(), MIB); 5227 5228 auto TBL1 = MIB.buildInstr( 5229 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 5230 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 5231 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 5232 5233 auto Copy = 5234 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 5235 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 5236 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 5237 I.eraseFromParent(); 5238 return true; 5239 } 5240 5241 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 5242 // Q registers for regalloc. 
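  // (The two-source TBL reads its table from consecutive Q registers, which is
  // what the tuple built below provides.)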
5243 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 5244 auto RegSeq = createQTuple(Regs, MIB); 5245 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 5246 {RegSeq, IndexLoad->getOperand(0)}); 5247 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 5248 I.eraseFromParent(); 5249 return true; 5250 } 5251 5252 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 5253 std::optional<Register> DstReg, Register SrcReg, Register EltReg, 5254 unsigned LaneIdx, const RegisterBank &RB, 5255 MachineIRBuilder &MIRBuilder) const { 5256 MachineInstr *InsElt = nullptr; 5257 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5258 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5259 5260 // Create a register to define with the insert if one wasn't passed in. 5261 if (!DstReg) 5262 DstReg = MRI.createVirtualRegister(DstRC); 5263 5264 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 5265 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 5266 5267 if (RB.getID() == AArch64::FPRRegBankID) { 5268 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 5269 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5270 .addImm(LaneIdx) 5271 .addUse(InsSub->getOperand(0).getReg()) 5272 .addImm(0); 5273 } else { 5274 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5275 .addImm(LaneIdx) 5276 .addUse(EltReg); 5277 } 5278 5279 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 5280 return InsElt; 5281 } 5282 5283 bool AArch64InstructionSelector::selectUSMovFromExtend( 5284 MachineInstr &MI, MachineRegisterInfo &MRI) { 5285 if (MI.getOpcode() != TargetOpcode::G_SEXT && 5286 MI.getOpcode() != TargetOpcode::G_ZEXT && 5287 MI.getOpcode() != TargetOpcode::G_ANYEXT) 5288 return false; 5289 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 5290 const Register DefReg = MI.getOperand(0).getReg(); 5291 const LLT DstTy = MRI.getType(DefReg); 5292 unsigned DstSize = DstTy.getSizeInBits(); 5293 5294 if (DstSize != 32 && DstSize != 64) 5295 return false; 5296 5297 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 5298 MI.getOperand(1).getReg(), MRI); 5299 int64_t Lane; 5300 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 5301 return false; 5302 Register Src0 = Extract->getOperand(1).getReg(); 5303 5304 const LLT &VecTy = MRI.getType(Src0); 5305 5306 if (VecTy.getSizeInBits() != 128) { 5307 const MachineInstr *ScalarToVector = emitScalarToVector( 5308 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 5309 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 5310 Src0 = ScalarToVector->getOperand(0).getReg(); 5311 } 5312 5313 unsigned Opcode; 5314 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 5315 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 5316 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 5317 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 5318 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 5319 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 5320 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 5321 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 5322 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 5323 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 5324 else 5325 llvm_unreachable("Unexpected type combo for S/UMov!"); 5326 5327 // We may need to generate one of these, depending on the type and sign of the 5328 // input: 5329 // DstReg = SMOV Src0, Lane; 5330 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 5331 MachineInstr *ExtI = nullptr; 5332 if (DstSize == 64 && !IsSigned) { 5333 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 5334 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 5335 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 5336 .addImm(0) 5337 .addUse(NewReg) 5338 .addImm(AArch64::sub_32); 5339 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 5340 } else 5341 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 5342 5343 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 5344 MI.eraseFromParent(); 5345 return true; 5346 } 5347 5348 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, 5349 MachineRegisterInfo &MRI) { 5350 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); 5351 5352 // Get information on the destination. 5353 Register DstReg = I.getOperand(0).getReg(); 5354 const LLT DstTy = MRI.getType(DstReg); 5355 unsigned VecSize = DstTy.getSizeInBits(); 5356 5357 // Get information on the element we want to insert into the destination. 5358 Register EltReg = I.getOperand(2).getReg(); 5359 const LLT EltTy = MRI.getType(EltReg); 5360 unsigned EltSize = EltTy.getSizeInBits(); 5361 if (EltSize < 16 || EltSize > 64) 5362 return false; // Don't support all element types yet. 5363 5364 // Find the definition of the index. Bail out if it's not defined by a 5365 // G_CONSTANT. 5366 Register IdxReg = I.getOperand(3).getReg(); 5367 auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI); 5368 if (!VRegAndVal) 5369 return false; 5370 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 5371 5372 // Perform the lane insert. 5373 Register SrcReg = I.getOperand(1).getReg(); 5374 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5375 5376 if (VecSize < 128) { 5377 // If the vector we're inserting into is smaller than 128 bits, widen it 5378 // to 128 to do the insert. 5379 MachineInstr *ScalarToVec = 5380 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB); 5381 if (!ScalarToVec) 5382 return false; 5383 SrcReg = ScalarToVec->getOperand(0).getReg(); 5384 } 5385 5386 // Create an insert into a new FPR128 register. 5387 // Note that if our vector is already 128 bits, we end up emitting an extra 5388 // register. 5389 MachineInstr *InsMI = 5390 emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB); 5391 5392 if (VecSize < 128) { 5393 // If we had to widen to perform the insert, then we have to demote back to 5394 // the original size to get the result we want. 5395 Register DemoteVec = InsMI->getOperand(0).getReg(); 5396 const TargetRegisterClass *RC = 5397 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI)); 5398 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5399 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5400 return false; 5401 } 5402 unsigned SubReg = 0; 5403 if (!getSubRegForClass(RC, TRI, SubReg)) 5404 return false; 5405 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5406 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << VecSize 5407 << "\n"); 5408 return false; 5409 } 5410 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 5411 .addReg(DemoteVec, 0, SubReg); 5412 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5413 } else { 5414 // No widening needed. 5415 InsMI->getOperand(0).setReg(DstReg); 5416 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 5417 } 5418 5419 I.eraseFromParent(); 5420 return true; 5421 } 5422 5423 MachineInstr * 5424 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5425 MachineIRBuilder &MIRBuilder, 5426 MachineRegisterInfo &MRI) { 5427 LLT DstTy = MRI.getType(Dst); 5428 unsigned DstSize = DstTy.getSizeInBits(); 5429 if (CV->isNullValue()) { 5430 if (DstSize == 128) { 5431 auto Mov = 5432 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5433 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5434 return &*Mov; 5435 } 5436 5437 if (DstSize == 64) { 5438 auto Mov = 5439 MIRBuilder 5440 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5441 .addImm(0); 5442 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5443 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5444 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5445 return &*Copy; 5446 } 5447 } 5448 5449 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5450 if (!CPLoad) { 5451 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5452 return nullptr; 5453 } 5454 5455 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5456 RBI.constrainGenericRegister( 5457 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5458 return &*Copy; 5459 } 5460 5461 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5462 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5463 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5464 unsigned DstSize = DstTy.getSizeInBits(); 5465 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5466 if (DstSize < 32) 5467 return false; 5468 // Check if we're building a constant vector, in which case we want to 5469 // generate a constant pool load instead of a vector insert sequence. 5470 SmallVector<Constant *, 16> Csts; 5471 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5472 // Try to find G_CONSTANT or G_FCONSTANT 5473 auto *OpMI = 5474 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5475 if (OpMI) 5476 Csts.emplace_back( 5477 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5478 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5479 I.getOperand(Idx).getReg(), MRI))) 5480 Csts.emplace_back( 5481 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5482 else 5483 return false; 5484 } 5485 Constant *CV = ConstantVector::get(Csts); 5486 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5487 return false; 5488 I.eraseFromParent(); 5489 return true; 5490 } 5491 5492 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5493 MachineInstr &I, MachineRegisterInfo &MRI) { 5494 // Given: 5495 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5496 // 5497 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5498 Register Dst = I.getOperand(0).getReg(); 5499 Register EltReg = I.getOperand(1).getReg(); 5500 LLT EltTy = MRI.getType(EltReg); 5501 // If the index isn't on the same bank as its elements, then this can't be a 5502 // SUBREG_TO_REG. 
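  // (SUBREG_TO_REG only reinterprets the element inside a wider register of
  // the same bank; it cannot move a value between GPR and FPR.)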
5503 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5504 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5505 if (EltRB != DstRB) 5506 return false; 5507 if (any_of(make_range(I.operands_begin() + 2, I.operands_end()), 5508 [&MRI](const MachineOperand &Op) { 5509 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), 5510 MRI); 5511 })) 5512 return false; 5513 unsigned SubReg; 5514 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); 5515 if (!EltRC) 5516 return false; 5517 const TargetRegisterClass *DstRC = 5518 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); 5519 if (!DstRC) 5520 return false; 5521 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5522 return false; 5523 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5524 .addImm(0) 5525 .addUse(EltReg) 5526 .addImm(SubReg); 5527 I.eraseFromParent(); 5528 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5529 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5530 } 5531 5532 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5533 MachineRegisterInfo &MRI) { 5534 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5535 // Until we port more of the optimized selections, for now just use a vector 5536 // insert sequence. 5537 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5538 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5539 unsigned EltSize = EltTy.getSizeInBits(); 5540 5541 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5542 return true; 5543 if (tryOptBuildVecToSubregToReg(I, MRI)) 5544 return true; 5545 5546 if (EltSize < 16 || EltSize > 64) 5547 return false; // Don't support all element types yet. 5548 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5549 5550 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5551 MachineInstr *ScalarToVec = 5552 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5553 I.getOperand(1).getReg(), MIB); 5554 if (!ScalarToVec) 5555 return false; 5556 5557 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5558 unsigned DstSize = DstTy.getSizeInBits(); 5559 5560 // Keep track of the last MI we inserted. Later on, we might be able to save 5561 // a copy using it. 5562 MachineInstr *PrevMI = nullptr; 5563 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5564 // Note that if we don't do a subregister copy, we can end up making an 5565 // extra register. 5566 PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(), 5567 i - 1, RB, MIB); 5568 DstVec = PrevMI->getOperand(0).getReg(); 5569 } 5570 5571 // If DstTy's size in bits is less than 128, then emit a subregister copy 5572 // from DstVec to the last register we've defined. 5573 if (DstSize < 128) { 5574 // Force this to be FPR using the destination vector. 5575 const TargetRegisterClass *RC = 5576 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5577 if (!RC) 5578 return false; 5579 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5580 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5581 return false; 5582 } 5583 5584 unsigned SubReg = 0; 5585 if (!getSubRegForClass(RC, TRI, SubReg)) 5586 return false; 5587 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5588 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << DstSize 5589 << "\n"); 5590 return false; 5591 } 5592 5593 Register Reg = MRI.createVirtualRegister(RC); 5594 Register DstReg = I.getOperand(0).getReg(); 5595 5596 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5597 MachineOperand &RegOp = I.getOperand(1); 5598 RegOp.setReg(Reg); 5599 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5600 } else { 5601 // We don't need a subregister copy. Save a copy by re-using the 5602 // destination register on the final insert. 5603 assert(PrevMI && "PrevMI was null?"); 5604 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5605 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5606 } 5607 5608 I.eraseFromParent(); 5609 return true; 5610 } 5611 5612 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 5613 unsigned NumVecs, 5614 MachineInstr &I) { 5615 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5616 assert(Opc && "Expected an opcode?"); 5617 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5618 auto &MRI = *MIB.getMRI(); 5619 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5620 unsigned Size = Ty.getSizeInBits(); 5621 assert((Size == 64 || Size == 128) && 5622 "Destination must be 64 bits or 128 bits?"); 5623 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 5624 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 5625 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 5626 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 5627 Load.cloneMemRefs(I); 5628 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5629 Register SelectedLoadDst = Load->getOperand(0).getReg(); 5630 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 5631 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 5632 .addReg(SelectedLoadDst, 0, SubReg + Idx); 5633 // Emit the subreg copies and immediately select them. 5634 // FIXME: We should refactor our copy code into an emitCopy helper and 5635 // clean up uses of this pattern elsewhere in the selector. 5636 selectCopy(*Vec, TII, MRI, TRI, RBI); 5637 } 5638 return true; 5639 } 5640 5641 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 5642 MachineInstr &I, MachineRegisterInfo &MRI) { 5643 // Find the intrinsic ID. 5644 unsigned IntrinID = I.getIntrinsicID(); 5645 5646 const LLT S8 = LLT::scalar(8); 5647 const LLT S16 = LLT::scalar(16); 5648 const LLT S32 = LLT::scalar(32); 5649 const LLT S64 = LLT::scalar(64); 5650 const LLT P0 = LLT::pointer(0, 64); 5651 // Select the instruction. 5652 switch (IntrinID) { 5653 default: 5654 return false; 5655 case Intrinsic::aarch64_ldxp: 5656 case Intrinsic::aarch64_ldaxp: { 5657 auto NewI = MIB.buildInstr( 5658 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 5659 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 5660 {I.getOperand(3)}); 5661 NewI.cloneMemRefs(I); 5662 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 5663 break; 5664 } 5665 case Intrinsic::trap: 5666 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); 5667 break; 5668 case Intrinsic::debugtrap: 5669 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); 5670 break; 5671 case Intrinsic::ubsantrap: 5672 MIB.buildInstr(AArch64::BRK, {}, {}) 5673 .addImm(I.getOperand(1).getImm() | ('U' << 8)); 5674 break; 5675 case Intrinsic::aarch64_neon_ld2: { 5676 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5677 unsigned Opc = 0; 5678 if (Ty == LLT::fixed_vector(8, S8)) 5679 Opc = AArch64::LD2Twov8b; 5680 else if (Ty == LLT::fixed_vector(16, S8)) 5681 Opc = AArch64::LD2Twov16b; 5682 else if (Ty == LLT::fixed_vector(4, S16)) 5683 Opc = AArch64::LD2Twov4h; 5684 else if (Ty == LLT::fixed_vector(8, S16)) 5685 Opc = AArch64::LD2Twov8h; 5686 else if (Ty == LLT::fixed_vector(2, S32)) 5687 Opc = AArch64::LD2Twov2s; 5688 else if (Ty == LLT::fixed_vector(4, S32)) 5689 Opc = AArch64::LD2Twov4s; 5690 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5691 Opc = AArch64::LD2Twov2d; 5692 else if (Ty == S64 || Ty == P0) 5693 Opc = AArch64::LD1Twov1d; 5694 else 5695 llvm_unreachable("Unexpected type for ld2!"); 5696 selectVectorLoadIntrinsic(Opc, 2, I); 5697 break; 5698 } 5699 case Intrinsic::aarch64_neon_ld4: { 5700 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5701 unsigned Opc = 0; 5702 if (Ty == LLT::fixed_vector(8, S8)) 5703 Opc = AArch64::LD4Fourv8b; 5704 else if (Ty == LLT::fixed_vector(16, S8)) 5705 Opc = AArch64::LD4Fourv16b; 5706 else if (Ty == LLT::fixed_vector(4, S16)) 5707 Opc = AArch64::LD4Fourv4h; 5708 else if (Ty == LLT::fixed_vector(8, S16)) 5709 Opc = AArch64::LD4Fourv8h; 5710 else if (Ty == LLT::fixed_vector(2, S32)) 5711 Opc = AArch64::LD4Fourv2s; 5712 else if (Ty == LLT::fixed_vector(4, S32)) 5713 Opc = AArch64::LD4Fourv4s; 5714 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5715 Opc = AArch64::LD4Fourv2d; 5716 else if (Ty == S64 || Ty == P0) 5717 Opc = AArch64::LD1Fourv1d; 5718 else 5719 llvm_unreachable("Unexpected type for ld4!"); 5720 selectVectorLoadIntrinsic(Opc, 4, I); 5721 break; 5722 } 5723 case Intrinsic::aarch64_neon_st2: { 5724 Register Src1 = I.getOperand(1).getReg(); 5725 Register Src2 = I.getOperand(2).getReg(); 5726 Register Ptr = I.getOperand(3).getReg(); 5727 LLT Ty = MRI.getType(Src1); 5728 unsigned Opc; 5729 if (Ty == LLT::fixed_vector(8, S8)) 5730 Opc = AArch64::ST2Twov8b; 5731 else if (Ty == LLT::fixed_vector(16, S8)) 5732 Opc = AArch64::ST2Twov16b; 5733 else if (Ty == LLT::fixed_vector(4, S16)) 5734 Opc = AArch64::ST2Twov4h; 5735 else if (Ty == LLT::fixed_vector(8, S16)) 5736 Opc = AArch64::ST2Twov8h; 5737 else if (Ty == LLT::fixed_vector(2, S32)) 5738 Opc = AArch64::ST2Twov2s; 5739 else if (Ty == LLT::fixed_vector(4, S32)) 5740 Opc = AArch64::ST2Twov4s; 5741 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 5742 Opc = AArch64::ST2Twov2d; 5743 else if (Ty == S64 || Ty == P0) 5744 Opc = AArch64::ST1Twov1d; 5745 else 5746 llvm_unreachable("Unexpected type for st2!"); 5747 SmallVector<Register, 2> Regs = {Src1, Src2}; 5748 Register Tuple = Ty.getSizeInBits() == 128 ? 
                           createQTuple(Regs, MIB) : createDTuple(Regs, MIB);
    auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
    Store.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
    break;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //   %dst:gpr(p0) = \
    //       G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
    //       \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //   %Rd:GPR64common, %Rn:GPR64 = \
    //       MOPSMemorySetTaggingPseudo \
    //       %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands is swapped.

    Register DstDef = I.getOperand(0).getReg();
    // I.getOperand(1) is the intrinsic function
    Register DstUse = I.getOperand(2).getReg();
    Register ValUse = I.getOperand(3).getReg();
    Register SizeUse = I.getOperand(4).getReg();

    // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
    Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));

    auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
                                 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
    Memset.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
    break;
  }
  }

  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  unsigned IntrinID = I.getIntrinsicID();

  switch (IntrinID) {
  default:
    break;
  case Intrinsic::aarch64_crypto_sha1h: {
    Register DstReg = I.getOperand(0).getReg();
    Register SrcReg = I.getOperand(2).getReg();

    // FIXME: Should this be an assert?
    if (MRI.getType(DstReg).getSizeInBits() != 32 ||
        MRI.getType(SrcReg).getSizeInBits() != 32)
      return false;

    // The operation has to happen on FPRs. Set up some new FPR registers for
    // the source and destination if they are on GPRs.
    if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
      SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
      MIB.buildCopy({SrcReg}, {I.getOperand(2)});

      // Make sure the copy ends up getting constrained properly.
      RBI.constrainGenericRegister(I.getOperand(2).getReg(),
                                   AArch64::GPR32RegClass, MRI);
    }

    if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
      DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);

    // Actually insert the instruction.
    auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
    constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);

    // Did we create a new register for the destination?
    if (DstReg != I.getOperand(0).getReg()) {
      // Yep. Copy the result of the instruction back into the original
      // destination.
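      // The original destination is on the GPR bank in this case, so the copy
      // result is constrained to GPR32 below.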
5829 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 5830 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 5831 AArch64::GPR32RegClass, MRI); 5832 } 5833 5834 I.eraseFromParent(); 5835 return true; 5836 } 5837 case Intrinsic::ptrauth_sign: { 5838 Register DstReg = I.getOperand(0).getReg(); 5839 Register ValReg = I.getOperand(2).getReg(); 5840 uint64_t Key = I.getOperand(3).getImm(); 5841 Register DiscReg = I.getOperand(4).getReg(); 5842 auto DiscVal = getIConstantVRegVal(DiscReg, MRI); 5843 bool IsDiscZero = DiscVal && DiscVal->isNullValue(); 5844 5845 if (Key > AArch64PACKey::LAST) 5846 return false; 5847 5848 unsigned Opcodes[][4] = { 5849 {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB}, 5850 {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}}; 5851 unsigned Opcode = Opcodes[IsDiscZero][Key]; 5852 5853 auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg}); 5854 5855 if (!IsDiscZero) { 5856 PAC.addUse(DiscReg); 5857 RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI); 5858 } 5859 5860 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5861 I.eraseFromParent(); 5862 return true; 5863 } 5864 case Intrinsic::ptrauth_strip: { 5865 Register DstReg = I.getOperand(0).getReg(); 5866 Register ValReg = I.getOperand(2).getReg(); 5867 uint64_t Key = I.getOperand(3).getImm(); 5868 5869 if (Key > AArch64PACKey::LAST) 5870 return false; 5871 unsigned Opcode = getXPACOpcodeForKey((AArch64PACKey::ID)Key); 5872 5873 MIB.buildInstr(Opcode, {DstReg}, {ValReg}); 5874 5875 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5876 RBI.constrainGenericRegister(ValReg, AArch64::GPR64RegClass, MRI); 5877 I.eraseFromParent(); 5878 return true; 5879 } 5880 case Intrinsic::frameaddress: 5881 case Intrinsic::returnaddress: { 5882 MachineFunction &MF = *I.getParent()->getParent(); 5883 MachineFrameInfo &MFI = MF.getFrameInfo(); 5884 5885 unsigned Depth = I.getOperand(2).getImm(); 5886 Register DstReg = I.getOperand(0).getReg(); 5887 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 5888 5889 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 5890 if (!MFReturnAddr) { 5891 // Insert the copy from LR/X30 into the entry block, before it can be 5892 // clobbered by anything. 
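        // getFunctionLiveInPhysReg marks LR as a function live-in and hands
        // back a vreg for it; cache it in MFReturnAddr so later
        // llvm.returnaddress uses in this function can reuse the same copy.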
5893 MFI.setReturnAddressIsTaken(true); 5894 MFReturnAddr = getFunctionLiveInPhysReg( 5895 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); 5896 } 5897 5898 if (STI.hasPAuth()) { 5899 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 5900 } else { 5901 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 5902 MIB.buildInstr(AArch64::XPACLRI); 5903 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5904 } 5905 5906 I.eraseFromParent(); 5907 return true; 5908 } 5909 5910 MFI.setFrameAddressIsTaken(true); 5911 Register FrameAddr(AArch64::FP); 5912 while (Depth--) { 5913 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 5914 auto Ldr = 5915 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 5916 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 5917 FrameAddr = NextFrame; 5918 } 5919 5920 if (IntrinID == Intrinsic::frameaddress) 5921 MIB.buildCopy({DstReg}, {FrameAddr}); 5922 else { 5923 MFI.setReturnAddressIsTaken(true); 5924 5925 if (STI.hasPAuth()) { 5926 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 5927 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 5928 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 5929 } else { 5930 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 5931 .addImm(1); 5932 MIB.buildInstr(AArch64::XPACLRI); 5933 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 5934 } 5935 } 5936 5937 I.eraseFromParent(); 5938 return true; 5939 } 5940 case Intrinsic::swift_async_context_addr: 5941 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 5942 {Register(AArch64::FP)}) 5943 .addImm(8) 5944 .addImm(0); 5945 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 5946 5947 MF->getFrameInfo().setFrameAddressIsTaken(true); 5948 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 5949 I.eraseFromParent(); 5950 return true; 5951 } 5952 return false; 5953 } 5954 5955 InstructionSelector::ComplexRendererFns 5956 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 5957 auto MaybeImmed = getImmedFromMO(Root); 5958 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 5959 return std::nullopt; 5960 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 5961 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5962 } 5963 5964 InstructionSelector::ComplexRendererFns 5965 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 5966 auto MaybeImmed = getImmedFromMO(Root); 5967 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 5968 return std::nullopt; 5969 uint64_t Enc = 31 - *MaybeImmed; 5970 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5971 } 5972 5973 InstructionSelector::ComplexRendererFns 5974 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 5975 auto MaybeImmed = getImmedFromMO(Root); 5976 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 5977 return std::nullopt; 5978 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 5979 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5980 } 5981 5982 InstructionSelector::ComplexRendererFns 5983 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 5984 auto MaybeImmed = getImmedFromMO(Root); 5985 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 5986 return std::nullopt; 5987 uint64_t Enc = 63 - *MaybeImmed; 5988 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 5989 } 5990 5991 /// Helper to select an immediate value that can be 
represented as a 12-bit 5992 /// value shifted left by either 0 or 12. If it is possible to do so, return 5993 /// the immediate and shift value. If not, return std::nullopt. 5994 /// 5995 /// Used by selectArithImmed and selectNegArithImmed. 5996 InstructionSelector::ComplexRendererFns 5997 AArch64InstructionSelector::select12BitValueWithLeftShift( 5998 uint64_t Immed) const { 5999 unsigned ShiftAmt; 6000 if (Immed >> 12 == 0) { 6001 ShiftAmt = 0; 6002 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 6003 ShiftAmt = 12; 6004 Immed = Immed >> 12; 6005 } else 6006 return std::nullopt; 6007 6008 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 6009 return {{ 6010 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 6011 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 6012 }}; 6013 } 6014 6015 /// SelectArithImmed - Select an immediate value that can be represented as 6016 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 6017 /// Val set to the 12-bit value and Shift set to the shifter operand. 6018 InstructionSelector::ComplexRendererFns 6019 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 6020 // This function is called from the addsub_shifted_imm ComplexPattern, 6021 // which lists [imm] as the list of opcode it's interested in, however 6022 // we still need to check whether the operand is actually an immediate 6023 // here because the ComplexPattern opcode list is only used in 6024 // root-level opcode matching. 6025 auto MaybeImmed = getImmedFromMO(Root); 6026 if (MaybeImmed == std::nullopt) 6027 return std::nullopt; 6028 return select12BitValueWithLeftShift(*MaybeImmed); 6029 } 6030 6031 /// SelectNegArithImmed - As above, but negates the value before trying to 6032 /// select it. 6033 InstructionSelector::ComplexRendererFns 6034 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 6035 // We need a register here, because we need to know if we have a 64 or 32 6036 // bit immediate. 6037 if (!Root.isReg()) 6038 return std::nullopt; 6039 auto MaybeImmed = getImmedFromMO(Root); 6040 if (MaybeImmed == std::nullopt) 6041 return std::nullopt; 6042 uint64_t Immed = *MaybeImmed; 6043 6044 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 6045 // have the opposite effect on the C flag, so this pattern mustn't match under 6046 // those circumstances. 6047 if (Immed == 0) 6048 return std::nullopt; 6049 6050 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 6051 // the root. 6052 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6053 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 6054 Immed = ~((uint32_t)Immed) + 1; 6055 else 6056 Immed = ~Immed + 1ULL; 6057 6058 if (Immed & 0xFFFFFFFFFF000000ULL) 6059 return std::nullopt; 6060 6061 Immed &= 0xFFFFFFULL; 6062 return select12BitValueWithLeftShift(Immed); 6063 } 6064 6065 /// Return true if it is worth folding MI into an extended register. That is, 6066 /// if it's safe to pull it into the addressing mode of a load or store as a 6067 /// shift. 6068 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 6069 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 6070 // Always fold if there is one use, or if we're optimizing for size. 
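  // (Folding a single-use shift never duplicates work, and the shifted-register
  // addressing form is no larger in code size, so under optsize it is taken
  // even if the shift has other users.)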
6071 Register DefReg = MI.getOperand(0).getReg(); 6072 if (MRI.hasOneNonDBGUse(DefReg) || 6073 MI.getParent()->getParent()->getFunction().hasOptSize()) 6074 return true; 6075 6076 // It's better to avoid folding and recomputing shifts when we don't have a 6077 // fastpath. 6078 if (!STI.hasLSLFast()) 6079 return false; 6080 6081 // We have a fastpath, so folding a shift in and potentially computing it 6082 // many times may be beneficial. Check if this is only used in memory ops. 6083 // If it is, then we should fold. 6084 return all_of(MRI.use_nodbg_instructions(DefReg), 6085 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 6086 } 6087 6088 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 6089 switch (Type) { 6090 case AArch64_AM::SXTB: 6091 case AArch64_AM::SXTH: 6092 case AArch64_AM::SXTW: 6093 return true; 6094 default: 6095 return false; 6096 } 6097 } 6098 6099 InstructionSelector::ComplexRendererFns 6100 AArch64InstructionSelector::selectExtendedSHL( 6101 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 6102 unsigned SizeInBytes, bool WantsExt) const { 6103 assert(Base.isReg() && "Expected base to be a register operand"); 6104 assert(Offset.isReg() && "Expected offset to be a register operand"); 6105 6106 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6107 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 6108 6109 unsigned OffsetOpc = OffsetInst->getOpcode(); 6110 bool LookedThroughZExt = false; 6111 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 6112 // Try to look through a ZEXT. 6113 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 6114 return std::nullopt; 6115 6116 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 6117 OffsetOpc = OffsetInst->getOpcode(); 6118 LookedThroughZExt = true; 6119 6120 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 6121 return std::nullopt; 6122 } 6123 // Make sure that the memory op is a valid size. 6124 int64_t LegalShiftVal = Log2_32(SizeInBytes); 6125 if (LegalShiftVal == 0) 6126 return std::nullopt; 6127 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 6128 return std::nullopt; 6129 6130 // Now, try to find the specific G_CONSTANT. Start by assuming that the 6131 // register we will offset is the LHS, and the register containing the 6132 // constant is the RHS. 6133 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 6134 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 6135 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6136 if (!ValAndVReg) { 6137 // We didn't get a constant on the RHS. If the opcode is a shift, then 6138 // we're done. 6139 if (OffsetOpc == TargetOpcode::G_SHL) 6140 return std::nullopt; 6141 6142 // If we have a G_MUL, we can use either register. Try looking at the RHS. 6143 std::swap(OffsetReg, ConstantReg); 6144 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6145 if (!ValAndVReg) 6146 return std::nullopt; 6147 } 6148 6149 // The value must fit into 3 bits, and must be positive. Make sure that is 6150 // true. 6151 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 6152 6153 // Since we're going to pull this into a shift, the constant value must be 6154 // a power of 2. If we got a multiply, then we need to check this. 6155 if (OffsetOpc == TargetOpcode::G_MUL) { 6156 if (!isPowerOf2_32(ImmVal)) 6157 return std::nullopt; 6158 6159 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 
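    // E.g. a G_MUL by 8 is treated as a shift amount of 3.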
6160 ImmVal = Log2_32(ImmVal); 6161 } 6162 6163 if ((ImmVal & 0x7) != ImmVal) 6164 return std::nullopt; 6165 6166 // We are only allowed to shift by LegalShiftVal. This shift value is built 6167 // into the instruction, so we can't just use whatever we want. 6168 if (ImmVal != LegalShiftVal) 6169 return std::nullopt; 6170 6171 unsigned SignExtend = 0; 6172 if (WantsExt) { 6173 // Check if the offset is defined by an extend, unless we looked through a 6174 // G_ZEXT earlier. 6175 if (!LookedThroughZExt) { 6176 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 6177 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 6178 if (Ext == AArch64_AM::InvalidShiftExtend) 6179 return std::nullopt; 6180 6181 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 6182 // We only support SXTW for signed extension here. 6183 if (SignExtend && Ext != AArch64_AM::SXTW) 6184 return std::nullopt; 6185 OffsetReg = ExtInst->getOperand(1).getReg(); 6186 } 6187 6188 // Need a 32-bit wide register here. 6189 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 6190 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 6191 } 6192 6193 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 6194 // offset. Signify that we are shifting by setting the shift flag to 1. 6195 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 6196 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 6197 [=](MachineInstrBuilder &MIB) { 6198 // Need to add both immediates here to make sure that they are both 6199 // added to the instruction. 6200 MIB.addImm(SignExtend); 6201 MIB.addImm(1); 6202 }}}; 6203 } 6204 6205 /// This is used for computing addresses like this: 6206 /// 6207 /// ldr x1, [x2, x3, lsl #3] 6208 /// 6209 /// Where x2 is the base register, and x3 is an offset register. The shift-left 6210 /// is a constant value specific to this load instruction. That is, we'll never 6211 /// see anything other than a 3 here (which corresponds to the size of the 6212 /// element being loaded.) 6213 InstructionSelector::ComplexRendererFns 6214 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( 6215 MachineOperand &Root, unsigned SizeInBytes) const { 6216 if (!Root.isReg()) 6217 return std::nullopt; 6218 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6219 6220 // We want to find something like this: 6221 // 6222 // val = G_CONSTANT LegalShiftVal 6223 // shift = G_SHL off_reg val 6224 // ptr = G_PTR_ADD base_reg shift 6225 // x = G_LOAD ptr 6226 // 6227 // And fold it into this addressing mode: 6228 // 6229 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] 6230 6231 // Check if we can find the G_PTR_ADD. 6232 MachineInstr *PtrAdd = 6233 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 6234 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 6235 return std::nullopt; 6236 6237 // Now, try to match an opcode which will match our specific offset. 6238 // We want a G_SHL or a G_MUL. 6239 MachineInstr *OffsetInst = 6240 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); 6241 return selectExtendedSHL(Root, PtrAdd->getOperand(1), 6242 OffsetInst->getOperand(0), SizeInBytes, 6243 /*WantsExt=*/false); 6244 } 6245 6246 /// This is used for computing addresses like this: 6247 /// 6248 /// ldr x1, [x2, x3] 6249 /// 6250 /// Where x2 is the base register, and x3 is an offset register. 6251 /// 6252 /// When possible (or profitable) to fold a G_PTR_ADD into the address 6253 /// calculation, this will do so. 
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeRegisterOffset(
    MachineOperand &Root) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // We need a GEP.
  MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
  if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
    return std::nullopt;

  // If this is used more than once, let's not bother folding.
  // TODO: Check if they are memory ops. If they are, then we can still fold
  // without having to recompute anything.
  if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
    return std::nullopt;

  // Base is the GEP's LHS, offset is its RHS.
  return {{[=](MachineInstrBuilder &MIB) {
             MIB.addUse(Gep->getOperand(1).getReg());
           },
           [=](MachineInstrBuilder &MIB) {
             MIB.addUse(Gep->getOperand(2).getReg());
           },
           [=](MachineInstrBuilder &MIB) {
             // Need to add both immediates here to make sure that they are
             // both added to the instruction.
             MIB.addImm(0);
             MIB.addImm(0);
           }}};
}

/// This is intended to be equivalent to selectAddrModeXRO in
/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  if (!Root.isReg())
    return std::nullopt;
  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd)
    return std::nullopt;

  // Check for an immediate which cannot be encoded in the [base + imm]
  // addressing mode, and can't be encoded in an add/sub. If this happens,
  // we'll end up with code like:
  //
  // mov x0, wide
  // add x1, base, x0
  // ldr x2, [x1]
  //
  // In this situation, we can use the [base, xreg] addressing mode to save an
  // add/sub:
  //
  // mov x0, wide
  // ldr x2, [base, x0]
  auto ValAndVReg =
      getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
  if (ValAndVReg) {
    unsigned Scale = Log2_32(SizeInBytes);
    int64_t ImmOff = ValAndVReg->Value.getSExtValue();

    // Skip immediates that can be selected in the load/store addressing
    // mode.
    if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
        ImmOff < (0x1000 << Scale))
      return std::nullopt;

    // Helper lambda to decide whether or not it is preferable to emit an add.
    auto isPreferredADD = [](int64_t ImmOff) {
      // Constants in [0x0, 0xfff] can be encoded in an add.
      if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
        return true;

      // Can it be encoded in an add lsl #12?
      if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
        return false;

      // It can be encoded in an add lsl #12, but we may not want to. If it is
      // possible to select this as a single movz, then prefer that. A single
      // movz is faster than an add with a shift.
      return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
             (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
    };

    // If the immediate can be encoded in a single add/sub, then bail out.
    if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
      return std::nullopt;
  }

  // Try to fold shifts into the addressing mode.
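  // e.g. ldr x0, [base, offset, lsl #3] for an 8-byte access, matching a
  // G_SHL that feeds the G_PTR_ADD.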
  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
  if (AddrModeFns)
    return AddrModeFns;

  // If that doesn't work, see if it's possible to fold in registers from
  // a GEP.
  return selectAddrModeRegisterOffset(Root);
}

/// This is used for computing addresses like this:
///
/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
///
/// Where we have a 64-bit base register, a 32-bit offset register, and an
/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
    return std::nullopt;

  MachineOperand &LHS = PtrAdd->getOperand(1);
  MachineOperand &RHS = PtrAdd->getOperand(2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
  //
  // e.g.
  // ldr something, [base_reg, ext_reg, sxtw]
  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
    return std::nullopt;

  // Check if this is an extend. We'll get an extend type if it is.
  AArch64_AM::ShiftExtendType Ext =
      getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
  if (Ext == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;

  // Need a 32-bit wide register.
  MachineIRBuilder MIB(*PtrAdd);
  Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
                                       AArch64::GPR32RegClass, MIB);
  unsigned SignExtend = Ext == AArch64_AM::SXTW;

  // Base is LHS, offset is ExtReg.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(SignExtend);
             MIB.addImm(0);
           }}};
}

/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
/// memory reference, which is needed here to know what is valid for a scaled
/// immediate.
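/// For example, an offset of -8 on an 8-byte access cannot use the scaled
/// form, but it fits the signed 9-bit range and can be selected as
/// ldur x0, [x1, #-8].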
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
                                                   unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  if (!isBaseWithConstantOffset(Root, MRI))
    return std::nullopt;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());

  MachineOperand &OffImm = RootDef->getOperand(2);
  if (!OffImm.isReg())
    return std::nullopt;
  MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
  if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
    return std::nullopt;
  int64_t RHSC;
  MachineOperand &RHSOp1 = RHS->getOperand(1);
  if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
    return std::nullopt;
  RHSC = RHSOp1.getCImm()->getSExtValue();

  // If the offset is valid as a scaled immediate, don't match here.
  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
      RHSC < (0x1000 << Log2_32(Size)))
    return std::nullopt;
  if (RHSC >= -256 && RHSC < 256) {
    MachineOperand &Base = RootDef->getOperand(1);
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
    }};
  }
  return std::nullopt;
}

InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
                                                 unsigned Size,
                                                 MachineRegisterInfo &MRI) const {
  if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
    return std::nullopt;
  MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
  if (Adrp.getOpcode() != AArch64::ADRP)
    return std::nullopt;

  // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
  auto Offset = Adrp.getOperand(1).getOffset();
  if (Offset % Size != 0)
    return std::nullopt;

  auto GV = Adrp.getOperand(1).getGlobal();
  if (GV->isThreadLocal())
    return std::nullopt;

  auto &MF = *RootDef.getParent()->getParent();
  if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
    return std::nullopt;

  unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
  MachineIRBuilder MIRBuilder(RootDef);
  Register AdrpReg = Adrp.getOperand(0).getReg();
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addGlobalAddress(GV, Offset,
                                  OpFlags | AArch64II::MO_PAGEOFF |
                                      AArch64II::MO_NC);
           }}};
}

/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineFunction &MF = *Root.getParent()->getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
  if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
    }};
  }

  CodeModel::Model CM = MF.getTarget().getCodeModel();
  // Check if we can fold in the ADD of a small-code-model ADRP + ADD address.
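  // i.e. selecting adrp x8, sym ; ldr x0, [x8, :lo12:sym] rather than
  // materializing the full address with a separate add first.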
  if (CM == CodeModel::Small) {
    auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
    if (OpFns)
      return OpFns;
  }

  if (isBaseWithConstantOffset(Root, MRI)) {
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineOperand &RHS = RootDef->getOperand(2);
    MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
    MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());

    int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
    unsigned Scale = Log2_32(Size);
    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
      if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
        }};

      return {{
          [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
      }};
    }
  }

  // Before falling back to our general case, check if the unscaled
  // instructions can handle this. If so, that's preferable.
  if (selectAddrModeUnscaled(Root, Size))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
  }};
}

/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case TargetOpcode::G_SHL:
    return AArch64_AM::LSL;
  case TargetOpcode::G_LSHR:
    return AArch64_AM::LSR;
  case TargetOpcode::G_ASHR:
    return AArch64_AM::ASR;
  case TargetOpcode::G_ROTR:
    return AArch64_AM::ROR;
  }
}

/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
                                                  bool AllowROR) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // Check if the operand is defined by an instruction which corresponds to
  // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
  MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
  AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
  if (ShType == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;
  if (ShType == AArch64_AM::ROR && !AllowROR)
    return std::nullopt;
  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
    return std::nullopt;

  // Need an immediate on the RHS.
  MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
  auto Immed = getImmedFromMO(ShiftRHS);
  if (!Immed)
    return std::nullopt;

  // We have something that we can fold. Fold in the shift's LHS and RHS into
  // the instruction.
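  // e.g. a G_SHL by 3 feeding a G_ADD can be selected as
  // add x0, x1, x2, lsl #3.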
  MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
  Register ShiftReg = ShiftLHS.getReg();

  unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
  unsigned Val = *Immed & (NumBits - 1);
  unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}

AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    else
      Size = MI.getOperand(2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}

/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
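/// e.g. add x0, x1, w2, sxtw #2, where w2 is the register being extended and
/// #2 is the optional left shift (at most 4).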
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return std::nullopt;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
    return std::nullopt;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return std::nullopt;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return std::nullopt;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return std::nullopt;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (isDef32(*ExtInst))
        return std::nullopt;
    }
  }

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
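  // e.g. if the extend was matched from a 64-bit G_AND mask, its input is
  // still 64 bits wide and has to be moved down to a W register first.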
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  std::optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(*CstVal);
}

void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
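  // Only a 32-bit def can give us the implicit zeroing of the upper 32 bits
  // of the X register; anything narrower or wider cannot.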
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs, build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    // =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
}