//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET


class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  /// Save state that is shared between select calls, call select on \p I and
  /// then restore the saved state. This can be used to recursively call select
  /// within a select call.
  bool selectAndRestoreState(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;
  /// Helper to narrow vector that was widened by emitScalarToVector.
  /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
  /// vector, correspondingly.
  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineRegisterInfo &MRI) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
                                  MachineIRBuilder &MIRBuilder);

  MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);

  MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);
  MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
                                     MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);

  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
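  ///
  /// As an illustrative sketch only (this example is not part of the original
  /// comment, and the concrete opcode is an assumption), a two-register
  /// structure load such as @llvm.aarch64.neon.ld2.v4i32 would be handled by
  /// a call along the lines of:
  /// \code
  ///   selectVectorLoadIntrinsic(AArch64::LD2Twov4s, /*NumVecs=*/2, I);
  /// \endcode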
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
                                     MachineInstr &I);
  void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
                                  unsigned Opc);
  bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
                                      unsigned Opc);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectPtrAuthGlobalValue(MachineInstr &I,
                                MachineRegisterInfo &MRI) const;
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
                   unsigned Opc1, unsigned Opc2, bool isExt);

  bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
  /// in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
  /// Might elide the instruction if the previous instruction already sets NZCV
  /// correctly.
  MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
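  ///
  /// As a rough sketch (not part of the original comment; the MIR below is
  /// simplified and not exact), a conjunction such as (a == 0) && (b == 1) is
  /// lowered so that the first compare sets NZCV and a CCMP evaluates the
  /// second compare only when the first condition held:
  /// \code
  ///   SUBSWri %a, 0, 0          ; flags for (a == 0)
  ///   CCMPWi  %b, 1, nzcv, eq   ; compare (b == 1) only if EQ held
  ///   ; OutCC = EQ decides the whole conjunction
  /// \endcode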
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  std::optional<bool>
  isWorthFoldingIntoAddrMode(MachineInstr &MI,
                             const MachineRegisterInfo &MRI) const;

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI,
                                     bool IsAddrOperand) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// None when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
//
/// Given a register bank, and a type, return the smallest register class that
/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    switch (Ty.getSizeInBits()) {
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
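///
/// For illustration (not part of the original comment; GPRBank stands for the
/// GPR register bank object in scope), a query such as
/// \code
///   getMinClassForRegBank(GPRBank, TypeSize::getFixed(32), /*GetAllRegSet=*/true)
/// \endcode
/// is expected to yield &AArch64::GPR32allRegClass, while any scalable size on
/// the FPR bank yields &AArch64::ZPRRegClass.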
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
                      bool GetAllRegSet = false) {
  if (SizeInBits.isScalable()) {
    assert(RB.getID() == AArch64::FPRRegBankID &&
           "Expected FPR regbank for scalable type size");
    return &AArch64::ZPRRegClass;
  }

  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of the REG_SEQUENCE instruction
/// that was created, or the 0th element of \p Regs if \p Regs contains a
/// single element.
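///
/// As an illustrative sketch (not part of the original comment), feeding two
/// D-registers through createDTuple below builds roughly:
/// \code
///   %tuple:dd = REG_SEQUENCE %d0, %subreg.dsub0, %d1, %subreg.dsub1
/// \endcode
/// i.e. a single REG_SEQUENCE whose register class is the two-element tuple
/// class (DD), with each input placed in its corresponding dsubN subregister.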
static Register createTuple(ArrayRef<Register> Regs,
                            const unsigned RegClassIDs[],
                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
  unsigned NumRegs = Regs.size();
  if (NumRegs == 1)
    return Regs[0];
  assert(NumRegs >= 2 && NumRegs <= 4 &&
         "Only support between two and 4 registers in a tuple!");
  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
  auto RegSequence =
      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
    RegSequence.addUse(Regs[I]);
    RegSequence.addImm(SubRegs[I]);
  }
  return RegSequence.getReg(0);
}

/// Create a tuple of D-registers using the registers in \p Regs.
static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
                                     AArch64::dsub2, AArch64::dsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

/// Create a tuple of Q-registers using the registers in \p Regs.
static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
                                     AArch64::qsub2, AArch64::qsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MI = *Root.getParent();
  auto &MBB = *MI.getParent();
  auto &MF = *MBB.getParent();
  auto &MRI = MF.getRegInfo();
  uint64_t Immed;
  if (Root.isImm())
    Immed = Root.getImm();
  else if (Root.isCImm())
    Immed = Root.getCImm()->getZExtValue();
  else if (Root.isReg()) {
    auto ValAndVReg =
        getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
    if (!ValAndVReg)
      return std::nullopt;
    Immed = ValAndVReg->Value.getSExtValue();
  } else
    return std::nullopt;
  return Immed;
}

/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - all operands are not in the same bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the aarch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
                             const AArch64RegisterBankInfo &RBI,
                             const MachineRegisterInfo &MRI,
                             const AArch64RegisterInfo &TRI) {
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  if (!Ty.isValid()) {
    LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
    return true;
  }

  const RegisterBank *PrevOpBank = nullptr;
  for (auto &MO : I.operands()) {
    // FIXME: Support non-register operands.
    if (!MO.isReg()) {
      LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
      return true;
    }

    // FIXME: Can generic operations have physical register operands? If
    // so, this will need to be taught about that, and we'll need to get the
    // bank out of the minimal class for the register.
    // Either way, this needs to be documented (and possibly verified).
    if (!MO.getReg().isVirtual()) {
      LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
      return true;
    }

    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
    if (!OpBank) {
      LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
      return true;
    }

    if (PrevOpBank && OpBank != PrevOpBank) {
      LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
      return true;
    }
    PrevOpBank = OpBank;
  }
  return false;
}

/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize. This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
    case 16:
      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
    case 32:
      return isStore ? AArch64::STRWui : AArch64::LDRWui;
    case 64:
      return isStore ? AArch64::STRXui : AArch64::LDRXui;
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBui : AArch64::LDRBui;
    case 16:
      return isStore ? AArch64::STRHui : AArch64::LDRHui;
    case 32:
      return isStore ? AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    case 128:
      return isStore ? AArch64::STRQui : AArch64::LDRQui;
    }
    break;
  }
  return GenericOpc;
}

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g. "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!I.getOperand(0).getReg().isPhysical())
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
    SrcSize = DstSize = TypeSize::getFixed(32);

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
    if (!RC) {
      const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, *RC, MRI);
  }

  return true;
}

static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
    const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
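      // For illustration (not part of the original comment), promoting a
      // 32-bit GPR source into a 64-bit destination ends up emitting roughly:
      //   %promoted:gpr64 = SUBREG_TO_REG 0, %src:gpr32, %subreg.sub_32
      // after which the copy is rewritten to read %promoted instead of %src.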
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return true;
}

static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  default:
    return GenericOpc;
  };
  return GenericOpc;
}

MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
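/// The second condition is returned in CondCode2 and is AArch64CC::AL when a
/// single AArch64 condition suffices.
///
/// As an illustrative sketch (not part of the original comment), a caller such
/// as selectCompareBranchFedByFCmp handles an FCMP_ONE branch by emitting one
/// Bcc per condition:
/// \code
///   // FCMP_ONE maps to {MI, GT}
///   Bcc MI, %bb.dest
///   Bcc GT, %bb.dest
/// \endcode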
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case CmpInst::FCMP_ONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case CmpInst::FCMP_UEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    std::optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(1).getReg();
      Register ConstantReg = MI->getOperand(2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND commutes, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(ConstantReg, TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      }
      if (VRegAndVal) {
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      TestReg = MI->getOperand(1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
      // the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}

MachineInstr *AArch64InstructionSelector::emitTestBit(
    Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
    MachineIRBuilder &MIB) const {
  assert(TestReg.isValid());
  assert(ProduceNonFlagSettingCondBr &&
         "Cannot emit TB(N)Z with speculation tracking!");
  MachineRegisterInfo &MRI = *MIB.getMRI();

  // Attempt to optimize the test bit by walking over instructions.
1596 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); 1597 LLT Ty = MRI.getType(TestReg); 1598 unsigned Size = Ty.getSizeInBits(); 1599 assert(!Ty.isVector() && "Expected a scalar!"); 1600 assert(Bit < 64 && "Bit is too large!"); 1601 1602 // When the test register is a 64-bit register, we have to narrow to make 1603 // TBNZW work. 1604 bool UseWReg = Bit < 32; 1605 unsigned NecessarySize = UseWReg ? 32 : 64; 1606 if (Size != NecessarySize) 1607 TestReg = moveScalarRegClass( 1608 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, 1609 MIB); 1610 1611 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, 1612 {AArch64::TBZW, AArch64::TBNZW}}; 1613 unsigned Opc = OpcTable[UseWReg][IsNegative]; 1614 auto TestBitMI = 1615 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); 1616 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); 1617 return &*TestBitMI; 1618 } 1619 1620 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( 1621 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, 1622 MachineIRBuilder &MIB) const { 1623 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); 1624 // Given something like this: 1625 // 1626 // %x = ...Something... 1627 // %one = G_CONSTANT i64 1 1628 // %zero = G_CONSTANT i64 0 1629 // %and = G_AND %x, %one 1630 // %cmp = G_ICMP intpred(ne), %and, %zero 1631 // %cmp_trunc = G_TRUNC %cmp 1632 // G_BRCOND %cmp_trunc, %bb.3 1633 // 1634 // We want to try and fold the AND into the G_BRCOND and produce either a 1635 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 1636 // 1637 // In this case, we'd get 1638 // 1639 // TBNZ %x %bb.3 1640 // 1641 1642 // Check if the AND has a constant on its RHS which we can use as a mask. 1643 // If it's a power of 2, then it's the same as checking a specific bit. 1644 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) 1645 auto MaybeBit = getIConstantVRegValWithLookThrough( 1646 AndInst.getOperand(2).getReg(), *MIB.getMRI()); 1647 if (!MaybeBit) 1648 return false; 1649 1650 int32_t Bit = MaybeBit->Value.exactLogBase2(); 1651 if (Bit < 0) 1652 return false; 1653 1654 Register TestReg = AndInst.getOperand(1).getReg(); 1655 1656 // Emit a TB(N)Z. 
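  // For illustration (hypothetical values, ignoring any further folding inside
  // emitTestBit): a mask of 8 (0b1000) gives Bit == 3, so intpred(eq) against
  // zero becomes "TBZ %x, #3, %bb" and intpred(ne) becomes "TBNZ %x, #3, %bb".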
1657 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1658 return true; 1659 } 1660 1661 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1662 bool IsNegative, 1663 MachineBasicBlock *DestMBB, 1664 MachineIRBuilder &MIB) const { 1665 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1666 MachineRegisterInfo &MRI = *MIB.getMRI(); 1667 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1668 AArch64::GPRRegBankID && 1669 "Expected GPRs only?"); 1670 auto Ty = MRI.getType(CompareReg); 1671 unsigned Width = Ty.getSizeInBits(); 1672 assert(!Ty.isVector() && "Expected scalar only?"); 1673 assert(Width <= 64 && "Expected width to be at most 64?"); 1674 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1675 {AArch64::CBNZW, AArch64::CBNZX}}; 1676 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1677 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1678 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1679 return &*BranchMI; 1680 } 1681 1682 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1683 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1684 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1685 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1686 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1687 // totally clean. Some of them require two branches to implement. 1688 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1689 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1690 Pred); 1691 AArch64CC::CondCode CC1, CC2; 1692 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1693 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1694 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1695 if (CC2 != AArch64CC::AL) 1696 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1697 I.eraseFromParent(); 1698 return true; 1699 } 1700 1701 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1702 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1703 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1704 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1705 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1706 // 1707 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1708 // instructions will not be produced, as they are conditional branch 1709 // instructions that do not set flags. 1710 if (!ProduceNonFlagSettingCondBr) 1711 return false; 1712 1713 MachineRegisterInfo &MRI = *MIB.getMRI(); 1714 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1715 auto Pred = 1716 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1717 Register LHS = ICmp.getOperand(2).getReg(); 1718 Register RHS = ICmp.getOperand(3).getReg(); 1719 1720 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1721 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1722 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1723 1724 // When we can emit a TB(N)Z, prefer that. 1725 // 1726 // Handle non-commutative condition codes first. 1727 // Note that we don't want to do this when we have a G_AND because it can 1728 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1729 if (VRegAndVal && !AndInst) { 1730 int64_t C = VRegAndVal->Value.getSExtValue(); 1731 1732 // When we have a greater-than comparison, we can just test if the msb is 1733 // zero. 
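    // For example (illustrative): for a 64-bit %x, "icmp sgt %x, -1" holds
    // exactly when %x is non-negative, i.e. when bit 63 is clear, so the
    // branch can be emitted as "TBZ %x, #63, %bb".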
1734 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1735 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1736 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1737 I.eraseFromParent(); 1738 return true; 1739 } 1740 1741 // When we have a less than comparison, we can just test if the msb is not 1742 // zero. 1743 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1744 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1745 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1746 I.eraseFromParent(); 1747 return true; 1748 } 1749 1750 // Inversely, if we have a signed greater-than-or-equal comparison to zero, 1751 // we can test if the msb is zero. 1752 if (C == 0 && Pred == CmpInst::ICMP_SGE) { 1753 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1754 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1755 I.eraseFromParent(); 1756 return true; 1757 } 1758 } 1759 1760 // Attempt to handle commutative condition codes. Right now, that's only 1761 // eq/ne. 1762 if (ICmpInst::isEquality(Pred)) { 1763 if (!VRegAndVal) { 1764 std::swap(RHS, LHS); 1765 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1766 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1767 } 1768 1769 if (VRegAndVal && VRegAndVal->Value == 0) { 1770 // If there's a G_AND feeding into this branch, try to fold it away by 1771 // emitting a TB(N)Z instead. 1772 // 1773 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1774 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1775 // would be redundant. 1776 if (AndInst && 1777 tryOptAndIntoCompareBranch( 1778 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1779 I.eraseFromParent(); 1780 return true; 1781 } 1782 1783 // Otherwise, try to emit a CB(N)Z instead. 1784 auto LHSTy = MRI.getType(LHS); 1785 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1786 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1787 I.eraseFromParent(); 1788 return true; 1789 } 1790 } 1791 } 1792 1793 return false; 1794 } 1795 1796 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1797 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1798 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1799 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1800 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1801 return true; 1802 1803 // Couldn't optimize. Emit a compare + a Bcc. 1804 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1805 auto PredOp = ICmp.getOperand(1); 1806 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1807 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1808 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1809 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1810 I.eraseFromParent(); 1811 return true; 1812 } 1813 1814 bool AArch64InstructionSelector::selectCompareBranch( 1815 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1816 Register CondReg = I.getOperand(0).getReg(); 1817 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1818 // Try to select the G_BRCOND using whatever is feeding the condition if 1819 // possible. 
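  // Rough summary (assumed, not exhaustive) of the cases handled below:
  //   fed by G_FCMP -> FP compare + one or two Bcc
  //   fed by G_ICMP -> TB(N)Z/CB(N)Z when profitable, else compare + Bcc
  //   anything else -> TBNZ on bit 0 of the condition, or a flag-setting AND
  //                    + Bcc when non-flag-setting branches are disallowed
  //                    (speculative load hardening)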
1820 unsigned CCMIOpc = CCMI->getOpcode(); 1821 if (CCMIOpc == TargetOpcode::G_FCMP) 1822 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1823 if (CCMIOpc == TargetOpcode::G_ICMP) 1824 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1825 1826 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1827 // instructions will not be produced, as they are conditional branch 1828 // instructions that do not set flags. 1829 if (ProduceNonFlagSettingCondBr) { 1830 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1831 I.getOperand(1).getMBB(), MIB); 1832 I.eraseFromParent(); 1833 return true; 1834 } 1835 1836 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1837 auto TstMI = 1838 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1839 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1840 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1841 .addImm(AArch64CC::NE) 1842 .addMBB(I.getOperand(1).getMBB()); 1843 I.eraseFromParent(); 1844 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1845 } 1846 1847 /// Returns the element immediate value of a vector shift operand if found. 1848 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1849 static std::optional<int64_t> getVectorShiftImm(Register Reg, 1850 MachineRegisterInfo &MRI) { 1851 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1852 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1853 return getAArch64VectorSplatScalar(*OpMI, MRI); 1854 } 1855 1856 /// Matches and returns the shift immediate value for a SHL instruction given 1857 /// a shift operand. 1858 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, 1859 MachineRegisterInfo &MRI) { 1860 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1861 if (!ShiftImm) 1862 return std::nullopt; 1863 // Check the immediate is in range for a SHL. 1864 int64_t Imm = *ShiftImm; 1865 if (Imm < 0) 1866 return std::nullopt; 1867 switch (SrcTy.getElementType().getSizeInBits()) { 1868 default: 1869 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1870 return std::nullopt; 1871 case 8: 1872 if (Imm > 7) 1873 return std::nullopt; 1874 break; 1875 case 16: 1876 if (Imm > 15) 1877 return std::nullopt; 1878 break; 1879 case 32: 1880 if (Imm > 31) 1881 return std::nullopt; 1882 break; 1883 case 64: 1884 if (Imm > 63) 1885 return std::nullopt; 1886 break; 1887 } 1888 return Imm; 1889 } 1890 1891 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1892 MachineRegisterInfo &MRI) { 1893 assert(I.getOpcode() == TargetOpcode::G_SHL); 1894 Register DstReg = I.getOperand(0).getReg(); 1895 const LLT Ty = MRI.getType(DstReg); 1896 Register Src1Reg = I.getOperand(1).getReg(); 1897 Register Src2Reg = I.getOperand(2).getReg(); 1898 1899 if (!Ty.isVector()) 1900 return false; 1901 1902 // Check if we have a vector of constants on RHS that we can select as the 1903 // immediate form. 1904 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1905 1906 unsigned Opc = 0; 1907 if (Ty == LLT::fixed_vector(2, 64)) { 1908 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1909 } else if (Ty == LLT::fixed_vector(4, 32)) { 1910 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1911 } else if (Ty == LLT::fixed_vector(2, 32)) { 1912 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1913 } else if (Ty == LLT::fixed_vector(4, 16)) { 1914 Opc = ImmVal ? 
AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1915 } else if (Ty == LLT::fixed_vector(8, 16)) { 1916 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1917 } else if (Ty == LLT::fixed_vector(16, 8)) { 1918 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1919 } else if (Ty == LLT::fixed_vector(8, 8)) { 1920 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1921 } else { 1922 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1923 return false; 1924 } 1925 1926 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1927 if (ImmVal) 1928 Shl.addImm(*ImmVal); 1929 else 1930 Shl.addUse(Src2Reg); 1931 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1932 I.eraseFromParent(); 1933 return true; 1934 } 1935 1936 bool AArch64InstructionSelector::selectVectorAshrLshr( 1937 MachineInstr &I, MachineRegisterInfo &MRI) { 1938 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1939 I.getOpcode() == TargetOpcode::G_LSHR); 1940 Register DstReg = I.getOperand(0).getReg(); 1941 const LLT Ty = MRI.getType(DstReg); 1942 Register Src1Reg = I.getOperand(1).getReg(); 1943 Register Src2Reg = I.getOperand(2).getReg(); 1944 1945 if (!Ty.isVector()) 1946 return false; 1947 1948 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1949 1950 // We expect the immediate case to be lowered in the PostLegalCombiner to 1951 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1952 1953 // There is not a shift right register instruction, but the shift left 1954 // register instruction takes a signed value, where negative numbers specify a 1955 // right shift. 1956 1957 unsigned Opc = 0; 1958 unsigned NegOpc = 0; 1959 const TargetRegisterClass *RC = 1960 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); 1961 if (Ty == LLT::fixed_vector(2, 64)) { 1962 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1963 NegOpc = AArch64::NEGv2i64; 1964 } else if (Ty == LLT::fixed_vector(4, 32)) { 1965 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1966 NegOpc = AArch64::NEGv4i32; 1967 } else if (Ty == LLT::fixed_vector(2, 32)) { 1968 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1969 NegOpc = AArch64::NEGv2i32; 1970 } else if (Ty == LLT::fixed_vector(4, 16)) { 1971 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1972 NegOpc = AArch64::NEGv4i16; 1973 } else if (Ty == LLT::fixed_vector(8, 16)) { 1974 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1975 NegOpc = AArch64::NEGv8i16; 1976 } else if (Ty == LLT::fixed_vector(16, 8)) { 1977 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1978 NegOpc = AArch64::NEGv16i8; 1979 } else if (Ty == LLT::fixed_vector(8, 8)) { 1980 Opc = IsASHR ? 
AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1981 NegOpc = AArch64::NEGv8i8; 1982 } else { 1983 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1984 return false; 1985 } 1986 1987 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1988 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1989 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1990 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1991 I.eraseFromParent(); 1992 return true; 1993 } 1994 1995 bool AArch64InstructionSelector::selectVaStartAAPCS( 1996 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1997 return false; 1998 } 1999 2000 bool AArch64InstructionSelector::selectVaStartDarwin( 2001 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 2002 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2003 Register ListReg = I.getOperand(0).getReg(); 2004 2005 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 2006 2007 int FrameIdx = FuncInfo->getVarArgsStackIndex(); 2008 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64( 2009 MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) { 2010 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0 2011 ? FuncInfo->getVarArgsGPRIndex() 2012 : FuncInfo->getVarArgsStackIndex(); 2013 } 2014 2015 auto MIB = 2016 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 2017 .addDef(ArgsAddrReg) 2018 .addFrameIndex(FrameIdx) 2019 .addImm(0) 2020 .addImm(0); 2021 2022 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2023 2024 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 2025 .addUse(ArgsAddrReg) 2026 .addUse(ListReg) 2027 .addImm(0) 2028 .addMemOperand(*I.memoperands_begin()); 2029 2030 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2031 I.eraseFromParent(); 2032 return true; 2033 } 2034 2035 void AArch64InstructionSelector::materializeLargeCMVal( 2036 MachineInstr &I, const Value *V, unsigned OpFlags) { 2037 MachineBasicBlock &MBB = *I.getParent(); 2038 MachineFunction &MF = *MBB.getParent(); 2039 MachineRegisterInfo &MRI = MF.getRegInfo(); 2040 2041 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 2042 MovZ->addOperand(MF, I.getOperand(1)); 2043 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 2044 AArch64II::MO_NC); 2045 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 2046 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 2047 2048 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 2049 Register ForceDstReg) { 2050 Register DstReg = ForceDstReg 2051 ? 
ForceDstReg 2052 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 2053 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 2054 if (auto *GV = dyn_cast<GlobalValue>(V)) { 2055 MovI->addOperand(MF, MachineOperand::CreateGA( 2056 GV, MovZ->getOperand(1).getOffset(), Flags)); 2057 } else { 2058 MovI->addOperand( 2059 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 2060 MovZ->getOperand(1).getOffset(), Flags)); 2061 } 2062 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 2063 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 2064 return DstReg; 2065 }; 2066 Register DstReg = BuildMovK(MovZ.getReg(0), 2067 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 2068 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 2069 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 2070 } 2071 2072 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 2073 MachineBasicBlock &MBB = *I.getParent(); 2074 MachineFunction &MF = *MBB.getParent(); 2075 MachineRegisterInfo &MRI = MF.getRegInfo(); 2076 2077 switch (I.getOpcode()) { 2078 case TargetOpcode::G_STORE: { 2079 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 2080 MachineOperand &SrcOp = I.getOperand(0); 2081 if (MRI.getType(SrcOp.getReg()).isPointer()) { 2082 // Allow matching with imported patterns for stores of pointers. Unlike 2083 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 2084 // and constrain. 2085 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 2086 Register NewSrc = Copy.getReg(0); 2087 SrcOp.setReg(NewSrc); 2088 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 2089 Changed = true; 2090 } 2091 return Changed; 2092 } 2093 case TargetOpcode::G_PTR_ADD: 2094 return convertPtrAddToAdd(I, MRI); 2095 case TargetOpcode::G_LOAD: { 2096 // For scalar loads of pointers, we try to convert the dest type from p0 2097 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD 2098 // conversion, this should be ok because all users should have been 2099 // selected already, so the type doesn't matter for them. 2100 Register DstReg = I.getOperand(0).getReg(); 2101 const LLT DstTy = MRI.getType(DstReg); 2102 if (!DstTy.isPointer()) 2103 return false; 2104 MRI.setType(DstReg, LLT::scalar(64)); 2105 return true; 2106 } 2107 case AArch64::G_DUP: { 2108 // Convert the type from p0 to s64 to help selection. 2109 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2110 if (!DstTy.isPointerVector()) 2111 return false; 2112 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 2113 MRI.setType(I.getOperand(0).getReg(), 2114 DstTy.changeElementType(LLT::scalar(64))); 2115 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2116 I.getOperand(1).setReg(NewSrc.getReg(0)); 2117 return true; 2118 } 2119 case TargetOpcode::G_UITOFP: 2120 case TargetOpcode::G_SITOFP: { 2121 // If both source and destination regbanks are FPR, then convert the opcode 2122 // to G_SITOF so that the importer can select it to an fpr variant. 2123 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 2124 // copy. 
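    // Illustrative (hypothetical) G_MIR for this rewrite:
    //   %dst:fpr(s32) = G_SITOFP %src:fpr(s32)
    //     -->
    //   %dst:fpr(s32) = G_SITOF %src:fpr(s32)
    // letting the imported FPR->FPR convert pattern (e.g. SCVTF) match without
    // inserting a cross-bank copy.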
2125 Register SrcReg = I.getOperand(1).getReg(); 2126 LLT SrcTy = MRI.getType(SrcReg); 2127 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2128 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2129 return false; 2130 2131 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2132 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2133 I.setDesc(TII.get(AArch64::G_SITOF)); 2134 else 2135 I.setDesc(TII.get(AArch64::G_UITOF)); 2136 return true; 2137 } 2138 return false; 2139 } 2140 default: 2141 return false; 2142 } 2143 } 2144 2145 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2146 /// them to a standard G_ADD with a COPY on the source. 2147 /// 2148 /// The motivation behind this is to expose the add semantics to the imported 2149 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2150 /// because the selector works bottom up, uses before defs. By the time we 2151 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2152 /// fold this into addressing modes and were therefore unsuccessful. 2153 bool AArch64InstructionSelector::convertPtrAddToAdd( 2154 MachineInstr &I, MachineRegisterInfo &MRI) { 2155 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2156 Register DstReg = I.getOperand(0).getReg(); 2157 Register AddOp1Reg = I.getOperand(1).getReg(); 2158 const LLT PtrTy = MRI.getType(DstReg); 2159 if (PtrTy.getAddressSpace() != 0) 2160 return false; 2161 2162 const LLT CastPtrTy = 2163 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2164 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2165 // Set regbanks on the registers. 2166 if (PtrTy.isVector()) 2167 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2168 else 2169 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2170 2171 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2172 // %dst(intty) = G_ADD %intbase, off 2173 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2174 MRI.setType(DstReg, CastPtrTy); 2175 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2176 if (!select(*PtrToInt)) { 2177 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2178 return false; 2179 } 2180 2181 // Also take the opportunity here to try to do some optimization. 2182 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 2183 Register NegatedReg; 2184 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2185 return true; 2186 I.getOperand(2).setReg(NegatedReg); 2187 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2188 return true; 2189 } 2190 2191 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2192 MachineRegisterInfo &MRI) { 2193 // We try to match the immediate variant of LSL, which is actually an alias 2194 // for a special case of UBFM. Otherwise, we fall back to the imported 2195 // selector which will match the register variant. 2196 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2197 const auto &MO = I.getOperand(2); 2198 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2199 if (!VRegAndVal) 2200 return false; 2201 2202 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2203 if (DstTy.isVector()) 2204 return false; 2205 bool Is64Bit = DstTy.getSizeInBits() == 64; 2206 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2207 auto Imm2Fn = Is64Bit ? 
selectShiftB_64(MO) : selectShiftB_32(MO); 2208 2209 if (!Imm1Fn || !Imm2Fn) 2210 return false; 2211 2212 auto NewI = 2213 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2214 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2215 2216 for (auto &RenderFn : *Imm1Fn) 2217 RenderFn(NewI); 2218 for (auto &RenderFn : *Imm2Fn) 2219 RenderFn(NewI); 2220 2221 I.eraseFromParent(); 2222 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2223 } 2224 2225 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2226 MachineInstr &I, MachineRegisterInfo &MRI) { 2227 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2228 // If we're storing a scalar, it doesn't matter what register bank that 2229 // scalar is on. All that matters is the size. 2230 // 2231 // So, if we see something like this (with a 32-bit scalar as an example): 2232 // 2233 // %x:gpr(s32) = ... something ... 2234 // %y:fpr(s32) = COPY %x:gpr(s32) 2235 // G_STORE %y:fpr(s32) 2236 // 2237 // We can fix this up into something like this: 2238 // 2239 // G_STORE %x:gpr(s32) 2240 // 2241 // And then continue the selection process normally. 2242 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2243 if (!DefDstReg.isValid()) 2244 return false; 2245 LLT DefDstTy = MRI.getType(DefDstReg); 2246 Register StoreSrcReg = I.getOperand(0).getReg(); 2247 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2248 2249 // If we get something strange like a physical register, then we shouldn't 2250 // go any further. 2251 if (!DefDstTy.isValid()) 2252 return false; 2253 2254 // Are the source and dst types the same size? 2255 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2256 return false; 2257 2258 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2259 RBI.getRegBank(DefDstReg, MRI, TRI)) 2260 return false; 2261 2262 // We have a cross-bank copy, which is entering a store. Let's fold it. 2263 I.getOperand(0).setReg(DefDstReg); 2264 return true; 2265 } 2266 2267 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2268 assert(I.getParent() && "Instruction should be in a basic block!"); 2269 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2270 2271 MachineBasicBlock &MBB = *I.getParent(); 2272 MachineFunction &MF = *MBB.getParent(); 2273 MachineRegisterInfo &MRI = MF.getRegInfo(); 2274 2275 switch (I.getOpcode()) { 2276 case AArch64::G_DUP: { 2277 // Before selecting a DUP instruction, check if it is better selected as a 2278 // MOV or load from a constant pool. 2279 Register Src = I.getOperand(1).getReg(); 2280 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI); 2281 if (!ValAndVReg) 2282 return false; 2283 LLVMContext &Ctx = MF.getFunction().getContext(); 2284 Register Dst = I.getOperand(0).getReg(); 2285 auto *CV = ConstantDataVector::getSplat( 2286 MRI.getType(Dst).getNumElements(), 2287 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), 2288 ValAndVReg->Value)); 2289 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2290 return false; 2291 I.eraseFromParent(); 2292 return true; 2293 } 2294 case TargetOpcode::G_SEXT: 2295 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2296 // over a normal extend. 
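    // Illustrative (hypothetical) shape being looked for here:
    //   %elt:gpr(s32) = G_EXTRACT_VECTOR_ELT %vec(<4 x s32>), 1
    //   %ext:gpr(s64) = G_SEXT %elt(s32)
    // which can be selected as a single SMOV (sign-extending element move)
    // instead of an element move followed by a separate sign extend.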
2297 if (selectUSMovFromExtend(I, MRI)) 2298 return true; 2299 return false; 2300 case TargetOpcode::G_BR: 2301 return false; 2302 case TargetOpcode::G_SHL: 2303 return earlySelectSHL(I, MRI); 2304 case TargetOpcode::G_CONSTANT: { 2305 bool IsZero = false; 2306 if (I.getOperand(1).isCImm()) 2307 IsZero = I.getOperand(1).getCImm()->isZero(); 2308 else if (I.getOperand(1).isImm()) 2309 IsZero = I.getOperand(1).getImm() == 0; 2310 2311 if (!IsZero) 2312 return false; 2313 2314 Register DefReg = I.getOperand(0).getReg(); 2315 LLT Ty = MRI.getType(DefReg); 2316 if (Ty.getSizeInBits() == 64) { 2317 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2318 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2319 } else if (Ty.getSizeInBits() == 32) { 2320 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2321 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2322 } else 2323 return false; 2324 2325 I.setDesc(TII.get(TargetOpcode::COPY)); 2326 return true; 2327 } 2328 2329 case TargetOpcode::G_ADD: { 2330 // Check if this is being fed by a G_ICMP on either side. 2331 // 2332 // (cmp pred, x, y) + z 2333 // 2334 // In the above case, when the cmp is true, we increment z by 1. So, we can 2335 // fold the add into the cset for the cmp by using cinc. 2336 // 2337 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2338 Register AddDst = I.getOperand(0).getReg(); 2339 Register AddLHS = I.getOperand(1).getReg(); 2340 Register AddRHS = I.getOperand(2).getReg(); 2341 // Only handle scalars. 2342 LLT Ty = MRI.getType(AddLHS); 2343 if (Ty.isVector()) 2344 return false; 2345 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2346 // bits. 2347 unsigned Size = Ty.getSizeInBits(); 2348 if (Size != 32 && Size != 64) 2349 return false; 2350 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2351 if (!MRI.hasOneNonDBGUse(Reg)) 2352 return nullptr; 2353 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2354 // compare. 2355 if (Size == 32) 2356 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2357 // We model scalar compares using 32-bit destinations right now. 2358 // If it's a 64-bit compare, it'll have 64-bit sources. 
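    // Illustrative (hypothetical) 64-bit shape accepted here:
    //   %cmp:gpr(s32) = G_ICMP intpred(eq), %x(s64), %y(s64)
    //   %ext:gpr(s64) = G_ZEXT %cmp(s32)
    //   %add:gpr(s64) = G_ADD %z, %ext
    // which is then emitted as a compare followed by
    //   CSINC %add, %z, %z, <inverse predicate>
    // i.e. %z + 1 when the predicate holds and %z otherwise.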
2359 Register ZExt; 2360 if (!mi_match(Reg, MRI, 2361 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2362 return nullptr; 2363 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2364 if (!Cmp || 2365 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2366 return nullptr; 2367 return Cmp; 2368 }; 2369 // Try to match 2370 // z + (cmp pred, x, y) 2371 MachineInstr *Cmp = MatchCmp(AddRHS); 2372 if (!Cmp) { 2373 // (cmp pred, x, y) + z 2374 std::swap(AddLHS, AddRHS); 2375 Cmp = MatchCmp(AddRHS); 2376 if (!Cmp) 2377 return false; 2378 } 2379 auto &PredOp = Cmp->getOperand(1); 2380 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2381 const AArch64CC::CondCode InvCC = 2382 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2383 MIB.setInstrAndDebugLoc(I); 2384 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2385 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2386 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2387 I.eraseFromParent(); 2388 return true; 2389 } 2390 case TargetOpcode::G_OR: { 2391 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2392 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2393 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2394 Register Dst = I.getOperand(0).getReg(); 2395 LLT Ty = MRI.getType(Dst); 2396 2397 if (!Ty.isScalar()) 2398 return false; 2399 2400 unsigned Size = Ty.getSizeInBits(); 2401 if (Size != 32 && Size != 64) 2402 return false; 2403 2404 Register ShiftSrc; 2405 int64_t ShiftImm; 2406 Register MaskSrc; 2407 int64_t MaskImm; 2408 if (!mi_match( 2409 Dst, MRI, 2410 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2411 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2412 return false; 2413 2414 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2415 return false; 2416 2417 int64_t Immr = Size - ShiftImm; 2418 int64_t Imms = Size - ShiftImm - 1; 2419 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2420 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2421 I.eraseFromParent(); 2422 return true; 2423 } 2424 case TargetOpcode::G_FENCE: { 2425 if (I.getOperand(1).getImm() == 0) 2426 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER)); 2427 else 2428 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB)) 2429 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); 2430 I.eraseFromParent(); 2431 return true; 2432 } 2433 default: 2434 return false; 2435 } 2436 } 2437 2438 bool AArch64InstructionSelector::select(MachineInstr &I) { 2439 assert(I.getParent() && "Instruction should be in a basic block!"); 2440 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2441 2442 MachineBasicBlock &MBB = *I.getParent(); 2443 MachineFunction &MF = *MBB.getParent(); 2444 MachineRegisterInfo &MRI = MF.getRegInfo(); 2445 2446 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); 2447 if (Subtarget->requiresStrictAlign()) { 2448 // We don't support this feature yet. 2449 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2450 return false; 2451 } 2452 2453 MIB.setInstrAndDebugLoc(I); 2454 2455 unsigned Opcode = I.getOpcode(); 2456 // G_PHI requires same handling as PHI 2457 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2458 // Certain non-generic instructions also need some special handling. 
2459 2460 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2461 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2462 2463 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2464 const Register DefReg = I.getOperand(0).getReg(); 2465 const LLT DefTy = MRI.getType(DefReg); 2466 2467 const RegClassOrRegBank &RegClassOrBank = 2468 MRI.getRegClassOrRegBank(DefReg); 2469 2470 const TargetRegisterClass *DefRC 2471 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 2472 if (!DefRC) { 2473 if (!DefTy.isValid()) { 2474 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2475 return false; 2476 } 2477 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 2478 DefRC = getRegClassForTypeOnBank(DefTy, RB); 2479 if (!DefRC) { 2480 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2481 return false; 2482 } 2483 } 2484 2485 I.setDesc(TII.get(TargetOpcode::PHI)); 2486 2487 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2488 } 2489 2490 if (I.isCopy()) 2491 return selectCopy(I, TII, MRI, TRI, RBI); 2492 2493 if (I.isDebugInstr()) 2494 return selectDebugInstr(I, MRI, RBI); 2495 2496 return true; 2497 } 2498 2499 2500 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2501 LLVM_DEBUG( 2502 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2503 return false; 2504 } 2505 2506 // Try to do some lowering before we start instruction selecting. These 2507 // lowerings are purely transformations on the input G_MIR and so selection 2508 // must continue after any modification of the instruction. 2509 if (preISelLower(I)) { 2510 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2511 } 2512 2513 // There may be patterns where the importer can't deal with them optimally, 2514 // but does select it to a suboptimal sequence so our custom C++ selection 2515 // code later never has a chance to work on it. Therefore, we have an early 2516 // selection attempt here to give priority to certain selection routines 2517 // over the imported ones. 2518 if (earlySelect(I)) 2519 return true; 2520 2521 if (selectImpl(I, *CoverageInfo)) 2522 return true; 2523 2524 LLT Ty = 2525 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{};
2526
2527   switch (Opcode) {
2528   case TargetOpcode::G_SBFX:
2529   case TargetOpcode::G_UBFX: {
2530     static const unsigned OpcTable[2][2] = {
2531         {AArch64::UBFMWri, AArch64::UBFMXri},
2532         {AArch64::SBFMWri, AArch64::SBFMXri}};
2533     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2534     unsigned Size = Ty.getSizeInBits();
2535     unsigned Opc = OpcTable[IsSigned][Size == 64];
2536     auto Cst1 =
2537         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2538     assert(Cst1 && "Should have gotten a constant for src 1?");
2539     auto Cst2 =
2540         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2541     assert(Cst2 && "Should have gotten a constant for src 2?");
2542     auto LSB = Cst1->Value.getZExtValue();
2543     auto Width = Cst2->Value.getZExtValue();
2544     auto BitfieldInst =
2545         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2546             .addImm(LSB)
2547             .addImm(LSB + Width - 1);
2548     I.eraseFromParent();
2549     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2550   }
2551   case TargetOpcode::G_BRCOND:
2552     return selectCompareBranch(I, MF, MRI);
2553
2554   case TargetOpcode::G_BRINDIRECT: {
2555     const Function &Fn = MF.getFunction();
2556     if (std::optional<uint16_t> BADisc =
2557             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2558       auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2559       MI.addImm(AArch64PACKey::IA);
2560       MI.addImm(*BADisc);
2561       MI.addReg(/*AddrDisc=*/AArch64::XZR);
2562       I.eraseFromParent();
2563       return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2564     }
2565     I.setDesc(TII.get(AArch64::BR));
2566     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2567   }
2568
2569   case TargetOpcode::G_BRJT:
2570     return selectBrJT(I, MRI);
2571
2572   case AArch64::G_ADD_LOW: {
2573     // This op may have been separated from its ADRP companion by the localizer
2574     // or some other code motion pass. Given that many CPUs will try to
2575     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2576     // which will later be expanded into an ADRP+ADD pair after scheduling.
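    // Illustrative (hypothetical) input and result for the fold below:
    //   %page:gpr64 = ADRP @sym
    //   %addr:gpr64 = G_ADD_LOW %page, @sym
    //     -->
    //   %addr:gpr64 = MOVaddr @sym (page), @sym (pageoff)
    // If the base is no longer an ADRP, the op is instead selected as a plain
    // ADDXri.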
2577 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2578 if (BaseMI->getOpcode() != AArch64::ADRP) { 2579 I.setDesc(TII.get(AArch64::ADDXri)); 2580 I.addOperand(MachineOperand::CreateImm(0)); 2581 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2582 } 2583 assert(TM.getCodeModel() == CodeModel::Small && 2584 "Expected small code model"); 2585 auto Op1 = BaseMI->getOperand(1); 2586 auto Op2 = I.getOperand(2); 2587 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2588 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2589 Op1.getTargetFlags()) 2590 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2591 Op2.getTargetFlags()); 2592 I.eraseFromParent(); 2593 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2594 } 2595 2596 case TargetOpcode::G_FCONSTANT: 2597 case TargetOpcode::G_CONSTANT: { 2598 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2599 2600 const LLT s8 = LLT::scalar(8); 2601 const LLT s16 = LLT::scalar(16); 2602 const LLT s32 = LLT::scalar(32); 2603 const LLT s64 = LLT::scalar(64); 2604 const LLT s128 = LLT::scalar(128); 2605 const LLT p0 = LLT::pointer(0, 64); 2606 2607 const Register DefReg = I.getOperand(0).getReg(); 2608 const LLT DefTy = MRI.getType(DefReg); 2609 const unsigned DefSize = DefTy.getSizeInBits(); 2610 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2611 2612 // FIXME: Redundant check, but even less readable when factored out. 2613 if (isFP) { 2614 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2615 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2616 << " constant, expected: " << s16 << " or " << s32 2617 << " or " << s64 << " or " << s128 << '\n'); 2618 return false; 2619 } 2620 2621 if (RB.getID() != AArch64::FPRRegBankID) { 2622 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2623 << " constant on bank: " << RB 2624 << ", expected: FPR\n"); 2625 return false; 2626 } 2627 2628 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2629 // can be sure tablegen works correctly and isn't rescued by this code. 2630 // 0.0 is not covered by tablegen for FP128. So we will handle this 2631 // scenario in the code here. 2632 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2633 return false; 2634 } else { 2635 // s32 and s64 are covered by tablegen. 2636 if (Ty != p0 && Ty != s8 && Ty != s16) { 2637 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2638 << " constant, expected: " << s32 << ", " << s64 2639 << ", or " << p0 << '\n'); 2640 return false; 2641 } 2642 2643 if (RB.getID() != AArch64::GPRRegBankID) { 2644 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2645 << " constant on bank: " << RB 2646 << ", expected: GPR\n"); 2647 return false; 2648 } 2649 } 2650 2651 if (isFP) { 2652 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); 2653 // For 16, 64, and 128b values, emit a constant pool load. 2654 switch (DefSize) { 2655 default: 2656 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2657 case 32: 2658 case 64: { 2659 bool OptForSize = shouldOptForSize(&MF); 2660 const auto &TLI = MF.getSubtarget().getTargetLowering(); 2661 // If TLI says that this fpimm is illegal, then we'll expand to a 2662 // constant pool load. 
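        // For illustration (assumed behaviour): an immediate the target can
        // rematerialize cheaply, e.g. 1.0, which the FP move-immediate
        // encoding can represent, takes the break below and is built via the
        // integer MOV + copy path further down; anything TLI reports as
        // illegal falls through to the constant-pool load.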
2663 if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(), 2664 EVT::getFloatingPointVT(DefSize), OptForSize)) 2665 break; 2666 [[fallthrough]]; 2667 } 2668 case 16: 2669 case 128: { 2670 auto *FPImm = I.getOperand(1).getFPImm(); 2671 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2672 if (!LoadMI) { 2673 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2674 return false; 2675 } 2676 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2677 I.eraseFromParent(); 2678 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2679 } 2680 } 2681 2682 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size"); 2683 // Either emit a FMOV, or emit a copy to emit a normal mov. 2684 const Register DefGPRReg = MRI.createVirtualRegister( 2685 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); 2686 MachineOperand &RegOp = I.getOperand(0); 2687 RegOp.setReg(DefGPRReg); 2688 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2689 MIB.buildCopy({DefReg}, {DefGPRReg}); 2690 2691 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2692 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2693 return false; 2694 } 2695 2696 MachineOperand &ImmOp = I.getOperand(1); 2697 // FIXME: Is going through int64_t always correct? 2698 ImmOp.ChangeToImmediate( 2699 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2700 } else if (I.getOperand(1).isCImm()) { 2701 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2702 I.getOperand(1).ChangeToImmediate(Val); 2703 } else if (I.getOperand(1).isImm()) { 2704 uint64_t Val = I.getOperand(1).getImm(); 2705 I.getOperand(1).ChangeToImmediate(Val); 2706 } 2707 2708 const unsigned MovOpc = 2709 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2710 I.setDesc(TII.get(MovOpc)); 2711 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2712 return true; 2713 } 2714 case TargetOpcode::G_EXTRACT: { 2715 Register DstReg = I.getOperand(0).getReg(); 2716 Register SrcReg = I.getOperand(1).getReg(); 2717 LLT SrcTy = MRI.getType(SrcReg); 2718 LLT DstTy = MRI.getType(DstReg); 2719 (void)DstTy; 2720 unsigned SrcSize = SrcTy.getSizeInBits(); 2721 2722 if (SrcTy.getSizeInBits() > 64) { 2723 // This should be an extract of an s128, which is like a vector extract. 2724 if (SrcTy.getSizeInBits() != 128) 2725 return false; 2726 // Only support extracting 64 bits from an s128 at the moment. 2727 if (DstTy.getSizeInBits() != 64) 2728 return false; 2729 2730 unsigned Offset = I.getOperand(2).getImm(); 2731 if (Offset % 64 != 0) 2732 return false; 2733 2734 // Check we have the right regbank always. 2735 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2736 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2737 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2738 2739 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2740 auto NewI = 2741 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2742 .addUse(SrcReg, 0, 2743 Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2744 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI, 2745 AArch64::GPR64RegClass, NewI->getOperand(0)); 2746 I.eraseFromParent(); 2747 return true; 2748 } 2749 2750 // Emit the same code as a vector extract. 2751 // Offset must be a multiple of 64. 
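    // Illustrative (hypothetical): extracting bits [127:64] of the s128 gives
    // LaneIdx == 1, roughly the vector element move "mov %dst, %src.d[1]";
    // bits [63:0] give lane 0.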
2752 unsigned LaneIdx = Offset / 64; 2753 MachineInstr *Extract = emitExtractVectorElt( 2754 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2755 if (!Extract) 2756 return false; 2757 I.eraseFromParent(); 2758 return true; 2759 } 2760 2761 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); 2762 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2763 Ty.getSizeInBits() - 1); 2764 2765 if (SrcSize < 64) { 2766 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2767 "unexpected G_EXTRACT types"); 2768 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2769 } 2770 2771 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2772 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2773 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2774 .addReg(DstReg, 0, AArch64::sub_32); 2775 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2776 AArch64::GPR32RegClass, MRI); 2777 I.getOperand(0).setReg(DstReg); 2778 2779 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2780 } 2781 2782 case TargetOpcode::G_INSERT: { 2783 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2784 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2785 unsigned DstSize = DstTy.getSizeInBits(); 2786 // Larger inserts are vectors, same-size ones should be something else by 2787 // now (split up or turned into COPYs). 2788 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2789 return false; 2790 2791 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2792 unsigned LSB = I.getOperand(3).getImm(); 2793 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2794 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2795 MachineInstrBuilder(MF, I).addImm(Width - 1); 2796 2797 if (DstSize < 64) { 2798 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2799 "unexpected G_INSERT types"); 2800 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2801 } 2802 2803 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2804 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2805 TII.get(AArch64::SUBREG_TO_REG)) 2806 .addDef(SrcReg) 2807 .addImm(0) 2808 .addUse(I.getOperand(2).getReg()) 2809 .addImm(AArch64::sub_32); 2810 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2811 AArch64::GPR32RegClass, MRI); 2812 I.getOperand(2).setReg(SrcReg); 2813 2814 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2815 } 2816 case TargetOpcode::G_FRAME_INDEX: { 2817 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2818 if (Ty != LLT::pointer(0, 64)) { 2819 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2820 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2821 return false; 2822 } 2823 I.setDesc(TII.get(AArch64::ADDXri)); 2824 2825 // MOs for a #0 shifted immediate. 2826 I.addOperand(MachineOperand::CreateImm(0)); 2827 I.addOperand(MachineOperand::CreateImm(0)); 2828 2829 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2830 } 2831 2832 case TargetOpcode::G_GLOBAL_VALUE: { 2833 const GlobalValue *GV = nullptr; 2834 unsigned OpFlags; 2835 if (I.getOperand(1).isSymbol()) { 2836 OpFlags = I.getOperand(1).getTargetFlags(); 2837 // Currently only used by "RtLibUseGOT". 
2838 assert(OpFlags == AArch64II::MO_GOT); 2839 } else { 2840 GV = I.getOperand(1).getGlobal(); 2841 if (GV->isThreadLocal()) 2842 return selectTLSGlobalValue(I, MRI); 2843 OpFlags = STI.ClassifyGlobalReference(GV, TM); 2844 } 2845 2846 if (OpFlags & AArch64II::MO_GOT) { 2847 I.setDesc(TII.get(AArch64::LOADgot)); 2848 I.getOperand(1).setTargetFlags(OpFlags); 2849 } else if (TM.getCodeModel() == CodeModel::Large && 2850 !TM.isPositionIndependent()) { 2851 // Materialize the global using movz/movk instructions. 2852 materializeLargeCMVal(I, GV, OpFlags); 2853 I.eraseFromParent(); 2854 return true; 2855 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2856 I.setDesc(TII.get(AArch64::ADR)); 2857 I.getOperand(1).setTargetFlags(OpFlags); 2858 } else { 2859 I.setDesc(TII.get(AArch64::MOVaddr)); 2860 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2861 MachineInstrBuilder MIB(MF, I); 2862 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2863 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2864 } 2865 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2866 } 2867 2868 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE: 2869 return selectPtrAuthGlobalValue(I, MRI); 2870 2871 case TargetOpcode::G_ZEXTLOAD: 2872 case TargetOpcode::G_LOAD: 2873 case TargetOpcode::G_STORE: { 2874 GLoadStore &LdSt = cast<GLoadStore>(I); 2875 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 2876 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 2877 2878 if (PtrTy != LLT::pointer(0, 64)) { 2879 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 2880 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2881 return false; 2882 } 2883 2884 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue(); 2885 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue(); 2886 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 2887 2888 // Need special instructions for atomics that affect ordering. 2889 if (Order != AtomicOrdering::NotAtomic && 2890 Order != AtomicOrdering::Unordered && 2891 Order != AtomicOrdering::Monotonic) { 2892 assert(!isa<GZExtLoad>(LdSt)); 2893 assert(MemSizeInBytes <= 8 && 2894 "128-bit atomics should already be custom-legalized"); 2895 2896 if (isa<GLoad>(LdSt)) { 2897 static constexpr unsigned LDAPROpcodes[] = { 2898 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; 2899 static constexpr unsigned LDAROpcodes[] = { 2900 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; 2901 ArrayRef<unsigned> Opcodes = 2902 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent 2903 ? LDAPROpcodes 2904 : LDAROpcodes; 2905 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2906 } else { 2907 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 2908 AArch64::STLRW, AArch64::STLRX}; 2909 Register ValReg = LdSt.getReg(0); 2910 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 2911 // Emit a subreg copy of 32 bits. 2912 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2913 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 2914 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 2915 I.getOperand(0).setReg(NewVal); 2916 } 2917 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2918 } 2919 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2920 return true; 2921 } 2922 2923 #ifndef NDEBUG 2924 const Register PtrReg = LdSt.getPointerReg(); 2925 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 2926 // Check that the pointer register is valid. 
2927 assert(PtrRB.getID() == AArch64::GPRRegBankID && 2928 "Load/Store pointer operand isn't a GPR"); 2929 assert(MRI.getType(PtrReg).isPointer() && 2930 "Load/Store pointer operand isn't a pointer"); 2931 #endif 2932 2933 const Register ValReg = LdSt.getReg(0); 2934 const LLT ValTy = MRI.getType(ValReg); 2935 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 2936 2937 // The code below doesn't support truncating stores, so we need to split it 2938 // again. 2939 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2940 unsigned SubReg; 2941 LLT MemTy = LdSt.getMMO().getMemoryType(); 2942 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2943 if (!getSubRegForClass(RC, TRI, SubReg)) 2944 return false; 2945 2946 // Generate a subreg copy. 2947 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 2948 .addReg(ValReg, 0, SubReg) 2949 .getReg(0); 2950 RBI.constrainGenericRegister(Copy, *RC, MRI); 2951 LdSt.getOperand(0).setReg(Copy); 2952 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2953 // If this is an any-extending load from the FPR bank, split it into a regular 2954 // load + extend. 2955 if (RB.getID() == AArch64::FPRRegBankID) { 2956 unsigned SubReg; 2957 LLT MemTy = LdSt.getMMO().getMemoryType(); 2958 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2959 if (!getSubRegForClass(RC, TRI, SubReg)) 2960 return false; 2961 Register OldDst = LdSt.getReg(0); 2962 Register NewDst = 2963 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 2964 LdSt.getOperand(0).setReg(NewDst); 2965 MRI.setRegBank(NewDst, RB); 2966 // Generate a SUBREG_TO_REG to extend it. 2967 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 2968 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 2969 .addImm(0) 2970 .addUse(NewDst) 2971 .addImm(SubReg); 2972 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); 2973 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 2974 MIB.setInstr(LdSt); 2975 } 2976 } 2977 2978 // Helper lambda for partially selecting I. Either returns the original 2979 // instruction with an updated opcode, or a new instruction. 2980 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 2981 bool IsStore = isa<GStore>(I); 2982 const unsigned NewOpc = 2983 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 2984 if (NewOpc == I.getOpcode()) 2985 return nullptr; 2986 // Check if we can fold anything into the addressing mode. 2987 auto AddrModeFns = 2988 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 2989 if (!AddrModeFns) { 2990 // Can't fold anything. Use the original instruction. 2991 I.setDesc(TII.get(NewOpc)); 2992 I.addOperand(MachineOperand::CreateImm(0)); 2993 return &I; 2994 } 2995 2996 // Folded something. Create a new instruction and return it. 2997 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 2998 Register CurValReg = I.getOperand(0).getReg(); 2999 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 3000 NewInst.cloneMemRefs(I); 3001 for (auto &Fn : *AddrModeFns) 3002 Fn(NewInst); 3003 I.eraseFromParent(); 3004 return &*NewInst; 3005 }; 3006 3007 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 3008 if (!LoadStore) 3009 return false; 3010 3011 // If we're storing a 0, use WZR/XZR. 
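    // Illustrative (hypothetical): a G_STORE of constant 0 selected to STRWui
    // below gets its value operand rewritten to WZR (XZR for STRXui), so no
    // separate zero needs to be materialized.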
3012 if (Opcode == TargetOpcode::G_STORE) { 3013 auto CVal = getIConstantVRegValWithLookThrough( 3014 LoadStore->getOperand(0).getReg(), MRI); 3015 if (CVal && CVal->Value == 0) { 3016 switch (LoadStore->getOpcode()) { 3017 case AArch64::STRWui: 3018 case AArch64::STRHHui: 3019 case AArch64::STRBBui: 3020 LoadStore->getOperand(0).setReg(AArch64::WZR); 3021 break; 3022 case AArch64::STRXui: 3023 LoadStore->getOperand(0).setReg(AArch64::XZR); 3024 break; 3025 } 3026 } 3027 } 3028 3029 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD && 3030 ValTy == LLT::scalar(64) && MemSizeInBits == 32)) { 3031 // The any/zextload from a smaller type to i32 should be handled by the 3032 // importer. 3033 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 3034 return false; 3035 // If we have an extending load then change the load's type to be a 3036 // narrower reg and zero_extend with SUBREG_TO_REG. 3037 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3038 Register DstReg = LoadStore->getOperand(0).getReg(); 3039 LoadStore->getOperand(0).setReg(LdReg); 3040 3041 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 3042 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 3043 .addImm(0) 3044 .addUse(LdReg) 3045 .addImm(AArch64::sub_32); 3046 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3047 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 3048 MRI); 3049 } 3050 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3051 } 3052 3053 case TargetOpcode::G_INDEXED_ZEXTLOAD: 3054 case TargetOpcode::G_INDEXED_SEXTLOAD: 3055 return selectIndexedExtLoad(I, MRI); 3056 case TargetOpcode::G_INDEXED_LOAD: 3057 return selectIndexedLoad(I, MRI); 3058 case TargetOpcode::G_INDEXED_STORE: 3059 return selectIndexedStore(cast<GIndexedStore>(I), MRI); 3060 3061 case TargetOpcode::G_LSHR: 3062 case TargetOpcode::G_ASHR: 3063 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 3064 return selectVectorAshrLshr(I, MRI); 3065 [[fallthrough]]; 3066 case TargetOpcode::G_SHL: 3067 if (Opcode == TargetOpcode::G_SHL && 3068 MRI.getType(I.getOperand(0).getReg()).isVector()) 3069 return selectVectorSHL(I, MRI); 3070 3071 // These shifts were legalized to have 64 bit shift amounts because we 3072 // want to take advantage of the selection patterns that assume the 3073 // immediates are s64s, however, selectBinaryOp will assume both operands 3074 // will have the same bit size. 3075 { 3076 Register SrcReg = I.getOperand(1).getReg(); 3077 Register ShiftReg = I.getOperand(2).getReg(); 3078 const LLT ShiftTy = MRI.getType(ShiftReg); 3079 const LLT SrcTy = MRI.getType(SrcReg); 3080 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 3081 ShiftTy.getSizeInBits() == 64) { 3082 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 3083 // Insert a subregister copy to implement a 64->32 trunc 3084 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 3085 .addReg(ShiftReg, 0, AArch64::sub_32); 3086 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 3087 I.getOperand(2).setReg(Trunc.getReg(0)); 3088 } 3089 } 3090 [[fallthrough]]; 3091 case TargetOpcode::G_OR: { 3092 // Reject the various things we don't support yet. 
3093 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3094 return false; 3095 3096 const unsigned OpSize = Ty.getSizeInBits(); 3097 3098 const Register DefReg = I.getOperand(0).getReg(); 3099 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3100 3101 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 3102 if (NewOpc == I.getOpcode()) 3103 return false; 3104 3105 I.setDesc(TII.get(NewOpc)); 3106 // FIXME: Should the type be always reset in setDesc? 3107 3108 // Now that we selected an opcode, we need to constrain the register 3109 // operands to use appropriate classes. 3110 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3111 } 3112 3113 case TargetOpcode::G_PTR_ADD: { 3114 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 3115 I.eraseFromParent(); 3116 return true; 3117 } 3118 3119 case TargetOpcode::G_SADDE: 3120 case TargetOpcode::G_UADDE: 3121 case TargetOpcode::G_SSUBE: 3122 case TargetOpcode::G_USUBE: 3123 case TargetOpcode::G_SADDO: 3124 case TargetOpcode::G_UADDO: 3125 case TargetOpcode::G_SSUBO: 3126 case TargetOpcode::G_USUBO: 3127 return selectOverflowOp(I, MRI); 3128 3129 case TargetOpcode::G_PTRMASK: { 3130 Register MaskReg = I.getOperand(2).getReg(); 3131 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3132 // TODO: Implement arbitrary cases 3133 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3134 return false; 3135 3136 uint64_t Mask = *MaskVal; 3137 I.setDesc(TII.get(AArch64::ANDXri)); 3138 I.getOperand(2).ChangeToImmediate( 3139 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3140 3141 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3142 } 3143 case TargetOpcode::G_PTRTOINT: 3144 case TargetOpcode::G_TRUNC: { 3145 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3146 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3147 3148 const Register DstReg = I.getOperand(0).getReg(); 3149 const Register SrcReg = I.getOperand(1).getReg(); 3150 3151 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3152 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3153 3154 if (DstRB.getID() != SrcRB.getID()) { 3155 LLVM_DEBUG( 3156 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3157 return false; 3158 } 3159 3160 if (DstRB.getID() == AArch64::GPRRegBankID) { 3161 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3162 if (!DstRC) 3163 return false; 3164 3165 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); 3166 if (!SrcRC) 3167 return false; 3168 3169 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3170 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3171 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); 3172 return false; 3173 } 3174 3175 if (DstRC == SrcRC) { 3176 // Nothing to be done 3177 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3178 SrcTy == LLT::scalar(64)) { 3179 llvm_unreachable("TableGen can import this case"); 3180 return false; 3181 } else if (DstRC == &AArch64::GPR32RegClass && 3182 SrcRC == &AArch64::GPR64RegClass) { 3183 I.getOperand(1).setSubReg(AArch64::sub_32); 3184 } else { 3185 LLVM_DEBUG( 3186 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3187 return false; 3188 } 3189 3190 I.setDesc(TII.get(TargetOpcode::COPY)); 3191 return true; 3192 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3193 if (DstTy == LLT::fixed_vector(4, 16) && 3194 SrcTy == LLT::fixed_vector(4, 32)) { 3195 
I.setDesc(TII.get(AArch64::XTNv4i16)); 3196 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3197 return true; 3198 } 3199 3200 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3201 MachineInstr *Extract = emitExtractVectorElt( 3202 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3203 if (!Extract) 3204 return false; 3205 I.eraseFromParent(); 3206 return true; 3207 } 3208 3209 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 3210 if (Opcode == TargetOpcode::G_PTRTOINT) { 3211 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3212 I.setDesc(TII.get(TargetOpcode::COPY)); 3213 return selectCopy(I, TII, MRI, TRI, RBI); 3214 } 3215 } 3216 3217 return false; 3218 } 3219 3220 case TargetOpcode::G_ANYEXT: { 3221 if (selectUSMovFromExtend(I, MRI)) 3222 return true; 3223 3224 const Register DstReg = I.getOperand(0).getReg(); 3225 const Register SrcReg = I.getOperand(1).getReg(); 3226 3227 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3228 if (RBDst.getID() != AArch64::GPRRegBankID) { 3229 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3230 << ", expected: GPR\n"); 3231 return false; 3232 } 3233 3234 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3235 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3236 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3237 << ", expected: GPR\n"); 3238 return false; 3239 } 3240 3241 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3242 3243 if (DstSize == 0) { 3244 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3245 return false; 3246 } 3247 3248 if (DstSize != 64 && DstSize > 32) { 3249 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3250 << ", expected: 32 or 64\n"); 3251 return false; 3252 } 3253 // At this point G_ANYEXT is just like a plain COPY, but we need 3254 // to explicitly form the 64-bit value if any. 3255 if (DstSize > 32) { 3256 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3257 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3258 .addDef(ExtSrc) 3259 .addImm(0) 3260 .addUse(SrcReg) 3261 .addImm(AArch64::sub_32); 3262 I.getOperand(1).setReg(ExtSrc); 3263 } 3264 return selectCopy(I, TII, MRI, TRI, RBI); 3265 } 3266 3267 case TargetOpcode::G_ZEXT: 3268 case TargetOpcode::G_SEXT_INREG: 3269 case TargetOpcode::G_SEXT: { 3270 if (selectUSMovFromExtend(I, MRI)) 3271 return true; 3272 3273 unsigned Opcode = I.getOpcode(); 3274 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3275 const Register DefReg = I.getOperand(0).getReg(); 3276 Register SrcReg = I.getOperand(1).getReg(); 3277 const LLT DstTy = MRI.getType(DefReg); 3278 const LLT SrcTy = MRI.getType(SrcReg); 3279 unsigned DstSize = DstTy.getSizeInBits(); 3280 unsigned SrcSize = SrcTy.getSizeInBits(); 3281 3282 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3283 // extended is encoded in the imm. 3284 if (Opcode == TargetOpcode::G_SEXT_INREG) 3285 SrcSize = I.getOperand(2).getImm(); 3286 3287 if (DstTy.isVector()) 3288 return false; // Should be handled by imported patterns. 3289 3290 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3291 AArch64::GPRRegBankID && 3292 "Unexpected ext regbank"); 3293 3294 MachineInstr *ExtI; 3295 3296 // First check if we're extending the result of a load which has a dest type 3297 // smaller than 32 bits, then this zext is redundant. 
GPR32 is the smallest 3298 // GPR register on AArch64 and all loads which are smaller automatically 3299 // zero-extend the upper bits. E.g. 3300 // %v(s8) = G_LOAD %p, :: (load 1) 3301 // %v2(s32) = G_ZEXT %v(s8) 3302 if (!IsSigned) { 3303 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3304 bool IsGPR = 3305 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3306 if (LoadMI && IsGPR) { 3307 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3308 unsigned BytesLoaded = MemOp->getSize().getValue(); 3309 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3310 return selectCopy(I, TII, MRI, TRI, RBI); 3311 } 3312 3313 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3314 // + SUBREG_TO_REG. 3315 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3316 Register SubregToRegSrc = 3317 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3318 const Register ZReg = AArch64::WZR; 3319 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg}) 3320 .addImm(0); 3321 3322 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3323 .addImm(0) 3324 .addUse(SubregToRegSrc) 3325 .addImm(AArch64::sub_32); 3326 3327 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3328 MRI)) { 3329 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3330 return false; 3331 } 3332 3333 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3334 MRI)) { 3335 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3336 return false; 3337 } 3338 3339 I.eraseFromParent(); 3340 return true; 3341 } 3342 } 3343 3344 if (DstSize == 64) { 3345 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3346 // FIXME: Can we avoid manually doing this? 3347 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3348 MRI)) { 3349 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3350 << " operand\n"); 3351 return false; 3352 } 3353 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3354 {&AArch64::GPR64RegClass}, {}) 3355 .addImm(0) 3356 .addUse(SrcReg) 3357 .addImm(AArch64::sub_32) 3358 .getReg(0); 3359 } 3360 3361 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3362 {DefReg}, {SrcReg}) 3363 .addImm(0) 3364 .addImm(SrcSize - 1); 3365 } else if (DstSize <= 32) { 3366 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, 3367 {DefReg}, {SrcReg}) 3368 .addImm(0) 3369 .addImm(SrcSize - 1); 3370 } else { 3371 return false; 3372 } 3373 3374 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3375 I.eraseFromParent(); 3376 return true; 3377 } 3378 3379 case TargetOpcode::G_SITOFP: 3380 case TargetOpcode::G_UITOFP: 3381 case TargetOpcode::G_FPTOSI: 3382 case TargetOpcode::G_FPTOUI: { 3383 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3384 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3385 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3386 if (NewOpc == Opcode) 3387 return false; 3388 3389 I.setDesc(TII.get(NewOpc)); 3390 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3391 I.setFlags(MachineInstr::NoFPExcept); 3392 3393 return true; 3394 } 3395 3396 case TargetOpcode::G_FREEZE: 3397 return selectCopy(I, TII, MRI, TRI, RBI); 3398 3399 case TargetOpcode::G_INTTOPTR: 3400 // The importer is currently unable to import pointer types since they 3401 // didn't exist in SelectionDAG. 
3402 return selectCopy(I, TII, MRI, TRI, RBI); 3403 3404 case TargetOpcode::G_BITCAST: 3405 // Imported SelectionDAG rules can handle every bitcast except those that 3406 // bitcast from a type to the same type. Ideally, these shouldn't occur 3407 // but we might not run an optimizer that deletes them. The other exception 3408 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3409 // of them. 3410 return selectCopy(I, TII, MRI, TRI, RBI); 3411 3412 case TargetOpcode::G_SELECT: { 3413 auto &Sel = cast<GSelect>(I); 3414 const Register CondReg = Sel.getCondReg(); 3415 const Register TReg = Sel.getTrueReg(); 3416 const Register FReg = Sel.getFalseReg(); 3417 3418 if (tryOptSelect(Sel)) 3419 return true; 3420 3421 // Make sure to use an unused vreg instead of wzr, so that the peephole 3422 // optimizations will be able to optimize these. 3423 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3424 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3425 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3426 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3427 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) 3428 return false; 3429 Sel.eraseFromParent(); 3430 return true; 3431 } 3432 case TargetOpcode::G_ICMP: { 3433 if (Ty.isVector()) 3434 return false; 3435 3436 if (Ty != LLT::scalar(32)) { 3437 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3438 << ", expected: " << LLT::scalar(32) << '\n'); 3439 return false; 3440 } 3441 3442 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3443 const AArch64CC::CondCode InvCC = 3444 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3445 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3446 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3447 /*Src2=*/AArch64::WZR, InvCC, MIB); 3448 I.eraseFromParent(); 3449 return true; 3450 } 3451 3452 case TargetOpcode::G_FCMP: { 3453 CmpInst::Predicate Pred = 3454 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3455 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3456 Pred) || 3457 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3458 return false; 3459 I.eraseFromParent(); 3460 return true; 3461 } 3462 case TargetOpcode::G_VASTART: 3463 return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI) 3464 : selectVaStartAAPCS(I, MF, MRI); 3465 case TargetOpcode::G_INTRINSIC: 3466 return selectIntrinsic(I, MRI); 3467 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3468 return selectIntrinsicWithSideEffects(I, MRI); 3469 case TargetOpcode::G_IMPLICIT_DEF: { 3470 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 3471 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3472 const Register DstReg = I.getOperand(0).getReg(); 3473 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3474 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3475 RBI.constrainGenericRegister(DstReg, *DstRC, MRI); 3476 return true; 3477 } 3478 case TargetOpcode::G_BLOCK_ADDR: { 3479 Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction(); 3480 if (std::optional<uint16_t> BADisc = 3481 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) { 3482 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {}); 3483 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 3484 MIB.buildInstr(AArch64::MOVaddrPAC) 3485 .addBlockAddress(I.getOperand(1).getBlockAddress()) 3486 .addImm(AArch64PACKey::IA) 3487 .addReg(/*AddrDisc=*/AArch64::XZR) 3488 .addImm(*BADisc) 3489 .constrainAllUses(TII, TRI, RBI); 3490 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16)); 3491 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 3492 AArch64::GPR64RegClass, MRI); 3493 I.eraseFromParent(); 3494 return true; 3495 } 3496 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { 3497 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); 3498 I.eraseFromParent(); 3499 return true; 3500 } else { 3501 I.setDesc(TII.get(AArch64::MOVaddrBA)); 3502 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), 3503 I.getOperand(0).getReg()) 3504 .addBlockAddress(I.getOperand(1).getBlockAddress(), 3505 /* Offset */ 0, AArch64II::MO_PAGE) 3506 .addBlockAddress( 3507 I.getOperand(1).getBlockAddress(), /* Offset */ 0, 3508 AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3509 I.eraseFromParent(); 3510 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3511 } 3512 } 3513 case AArch64::G_DUP: { 3514 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by 3515 // imported patterns. Do it manually here. Avoiding generating s16 gpr is 3516 // difficult because at RBS we may end up pessimizing the fpr case if we 3517 // decided to add an anyextend to fix this. Manual selection is the most 3518 // robust solution for now. 3519 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 3520 AArch64::GPRRegBankID) 3521 return false; // We expect the fpr regbank case to be imported. 
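// Otherwise map the destination vector type onto the matching DUP*gpr
// instruction; any other type is rejected below.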
3522 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3523 if (VecTy == LLT::fixed_vector(8, 8)) 3524 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3525 else if (VecTy == LLT::fixed_vector(16, 8)) 3526 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3527 else if (VecTy == LLT::fixed_vector(4, 16)) 3528 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3529 else if (VecTy == LLT::fixed_vector(8, 16)) 3530 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3531 else 3532 return false; 3533 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3534 } 3535 case TargetOpcode::G_BUILD_VECTOR: 3536 return selectBuildVector(I, MRI); 3537 case TargetOpcode::G_MERGE_VALUES: 3538 return selectMergeValues(I, MRI); 3539 case TargetOpcode::G_UNMERGE_VALUES: 3540 return selectUnmergeValues(I, MRI); 3541 case TargetOpcode::G_SHUFFLE_VECTOR: 3542 return selectShuffleVector(I, MRI); 3543 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3544 return selectExtractElt(I, MRI); 3545 case TargetOpcode::G_CONCAT_VECTORS: 3546 return selectConcatVectors(I, MRI); 3547 case TargetOpcode::G_JUMP_TABLE: 3548 return selectJumpTable(I, MRI); 3549 case TargetOpcode::G_MEMCPY: 3550 case TargetOpcode::G_MEMCPY_INLINE: 3551 case TargetOpcode::G_MEMMOVE: 3552 case TargetOpcode::G_MEMSET: 3553 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); 3554 return selectMOPS(I, MRI); 3555 } 3556 3557 return false; 3558 } 3559 3560 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) { 3561 MachineIRBuilderState OldMIBState = MIB.getState(); 3562 bool Success = select(I); 3563 MIB.setState(OldMIBState); 3564 return Success; 3565 } 3566 3567 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, 3568 MachineRegisterInfo &MRI) { 3569 unsigned Mopcode; 3570 switch (GI.getOpcode()) { 3571 case TargetOpcode::G_MEMCPY: 3572 case TargetOpcode::G_MEMCPY_INLINE: 3573 Mopcode = AArch64::MOPSMemoryCopyPseudo; 3574 break; 3575 case TargetOpcode::G_MEMMOVE: 3576 Mopcode = AArch64::MOPSMemoryMovePseudo; 3577 break; 3578 case TargetOpcode::G_MEMSET: 3579 // For tagged memset see llvm.aarch64.mops.memset.tag 3580 Mopcode = AArch64::MOPSMemorySetPseudo; 3581 break; 3582 } 3583 3584 auto &DstPtr = GI.getOperand(0); 3585 auto &SrcOrVal = GI.getOperand(1); 3586 auto &Size = GI.getOperand(2); 3587 3588 // Create copies of the registers that can be clobbered. 3589 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); 3590 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); 3591 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); 3592 3593 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; 3594 const auto &SrcValRegClass = 3595 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; 3596 3597 // Constrain to specific registers 3598 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); 3599 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); 3600 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); 3601 3602 MIB.buildCopy(DstPtrCopy, DstPtr); 3603 MIB.buildCopy(SrcValCopy, SrcOrVal); 3604 MIB.buildCopy(SizeCopy, Size); 3605 3606 // New instruction uses the copied registers because it must update them. 3607 // The defs are not used since they don't exist in G_MEM*. They are still 3608 // tied. 
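// (Fresh vregs are created for those defs below purely to satisfy the
// pseudo's operand list; nothing reads them afterwards.)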
3609 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE 3610 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); 3611 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3612 if (IsSet) { 3613 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, 3614 {DstPtrCopy, SizeCopy, SrcValCopy}); 3615 } else { 3616 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); 3617 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, 3618 {DstPtrCopy, SrcValCopy, SizeCopy}); 3619 } 3620 3621 GI.eraseFromParent(); 3622 return true; 3623 } 3624 3625 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3626 MachineRegisterInfo &MRI) { 3627 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3628 Register JTAddr = I.getOperand(0).getReg(); 3629 unsigned JTI = I.getOperand(1).getIndex(); 3630 Register Index = I.getOperand(2).getReg(); 3631 3632 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3633 3634 // With aarch64-jump-table-hardening, we only expand the jump table dispatch 3635 // sequence later, to guarantee the integrity of the intermediate values. 3636 if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) { 3637 CodeModel::Model CM = TM.getCodeModel(); 3638 if (STI.isTargetMachO()) { 3639 if (CM != CodeModel::Small && CM != CodeModel::Large) 3640 report_fatal_error("Unsupported code-model for hardened jump-table"); 3641 } else { 3642 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO. 3643 assert(STI.isTargetELF() && 3644 "jump table hardening only supported on MachO/ELF"); 3645 if (CM != CodeModel::Small) 3646 report_fatal_error("Unsupported code-model for hardened jump-table"); 3647 } 3648 3649 MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg()); 3650 MIB.buildInstr(AArch64::BR_JumpTable) 3651 .addJumpTableIndex(I.getOperand(1).getIndex()); 3652 I.eraseFromParent(); 3653 return true; 3654 } 3655 3656 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3657 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3658 3659 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3660 {TargetReg, ScratchReg}, {JTAddr, Index}) 3661 .addJumpTableIndex(JTI); 3662 // Save the jump table info. 3663 MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {}, 3664 {static_cast<int64_t>(JTI)}); 3665 // Build the indirect branch. 3666 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3667 I.eraseFromParent(); 3668 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3669 } 3670 3671 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3672 MachineRegisterInfo &MRI) { 3673 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3674 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3675 3676 Register DstReg = I.getOperand(0).getReg(); 3677 unsigned JTI = I.getOperand(1).getIndex(); 3678 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 
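// Roughly (label name illustrative):
//   adrp xN, .LJTI0_0
//   add  xN, xN, :lo12:.LJTI0_0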
3679 auto MovMI = 3680 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3681 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3682 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3683 I.eraseFromParent(); 3684 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3685 } 3686 3687 bool AArch64InstructionSelector::selectTLSGlobalValue( 3688 MachineInstr &I, MachineRegisterInfo &MRI) { 3689 if (!STI.isTargetMachO()) 3690 return false; 3691 MachineFunction &MF = *I.getParent()->getParent(); 3692 MF.getFrameInfo().setAdjustsStack(true); 3693 3694 const auto &GlobalOp = I.getOperand(1); 3695 assert(GlobalOp.getOffset() == 0 && 3696 "Shouldn't have an offset on TLS globals!"); 3697 const GlobalValue &GV = *GlobalOp.getGlobal(); 3698 3699 auto LoadGOT = 3700 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3701 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3702 3703 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3704 {LoadGOT.getReg(0)}) 3705 .addImm(0); 3706 3707 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3708 // TLS calls preserve all registers except those that absolutely must be 3709 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3710 // silly). 3711 unsigned Opcode = getBLRCallOpcode(MF); 3712 3713 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0). 3714 if (MF.getFunction().hasFnAttribute("ptrauth-calls")) { 3715 assert(Opcode == AArch64::BLR); 3716 Opcode = AArch64::BLRAAZ; 3717 } 3718 3719 MIB.buildInstr(Opcode, {}, {Load}) 3720 .addUse(AArch64::X0, RegState::Implicit) 3721 .addDef(AArch64::X0, RegState::Implicit) 3722 .addRegMask(TRI.getTLSCallPreservedMask()); 3723 3724 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3725 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3726 MRI); 3727 I.eraseFromParent(); 3728 return true; 3729 } 3730 3731 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3732 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3733 MachineIRBuilder &MIRBuilder) const { 3734 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3735 3736 auto BuildFn = [&](unsigned SubregIndex) { 3737 auto Ins = 3738 MIRBuilder 3739 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3740 .addImm(SubregIndex); 3741 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3742 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3743 return &*Ins; 3744 }; 3745 3746 switch (EltSize) { 3747 case 8: 3748 return BuildFn(AArch64::bsub); 3749 case 16: 3750 return BuildFn(AArch64::hsub); 3751 case 32: 3752 return BuildFn(AArch64::ssub); 3753 case 64: 3754 return BuildFn(AArch64::dsub); 3755 default: 3756 return nullptr; 3757 } 3758 } 3759 3760 MachineInstr * 3761 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg, 3762 MachineIRBuilder &MIB, 3763 MachineRegisterInfo &MRI) const { 3764 LLT DstTy = MRI.getType(DstReg); 3765 const TargetRegisterClass *RC = 3766 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI)); 3767 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 3768 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 3769 return nullptr; 3770 } 3771 unsigned SubReg = 0; 3772 if (!getSubRegForClass(RC, TRI, SubReg)) 3773 return nullptr; 3774 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 3775 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" 3776 << DstTy.getSizeInBits() << "\n"); 3777 return nullptr; 3778 } 3779 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3780 .addReg(SrcReg, 0, SubReg); 3781 RBI.constrainGenericRegister(DstReg, *RC, MRI); 3782 return Copy; 3783 } 3784 3785 bool AArch64InstructionSelector::selectMergeValues( 3786 MachineInstr &I, MachineRegisterInfo &MRI) { 3787 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 3788 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3789 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3790 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 3791 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 3792 3793 if (I.getNumOperands() != 3) 3794 return false; 3795 3796 // Merging 2 s64s into an s128. 3797 if (DstTy == LLT::scalar(128)) { 3798 if (SrcTy.getSizeInBits() != 64) 3799 return false; 3800 Register DstReg = I.getOperand(0).getReg(); 3801 Register Src1Reg = I.getOperand(1).getReg(); 3802 Register Src2Reg = I.getOperand(2).getReg(); 3803 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 3804 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg, 3805 /* LaneIdx */ 0, RB, MIB); 3806 if (!InsMI) 3807 return false; 3808 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 3809 Src2Reg, /* LaneIdx */ 1, RB, MIB); 3810 if (!Ins2MI) 3811 return false; 3812 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 3813 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 3814 I.eraseFromParent(); 3815 return true; 3816 } 3817 3818 if (RB.getID() != AArch64::GPRRegBankID) 3819 return false; 3820 3821 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 3822 return false; 3823 3824 auto *DstRC = &AArch64::GPR64RegClass; 3825 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 3826 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3827 TII.get(TargetOpcode::SUBREG_TO_REG)) 3828 .addDef(SubToRegDef) 3829 .addImm(0) 3830 .addUse(I.getOperand(1).getReg()) 3831 .addImm(AArch64::sub_32); 3832 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 3833 // Need to anyext the second scalar before we can use bfm 3834 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3835 TII.get(TargetOpcode::SUBREG_TO_REG)) 3836 .addDef(SubToRegDef2) 3837 .addImm(0) 3838 .addUse(I.getOperand(2).getReg()) 3839 .addImm(AArch64::sub_32); 3840 MachineInstr &BFM = 3841 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 3842 .addDef(I.getOperand(0).getReg()) 3843 .addUse(SubToRegDef) 3844 .addUse(SubToRegDef2) 3845 .addImm(32) 3846 .addImm(31); 3847 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 3848 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 3849 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 3850 I.eraseFromParent(); 3851 return true; 3852 } 3853 3854 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 3855 const unsigned EltSize) { 3856 // Choose a lane copy opcode and subregister based off of the size of the 3857 // vector's elements. 
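// The DUP (element, scalar) forms copy a single lane of a 128-bit vector into
// a scalar FPR, which is how lane extracts are modelled here.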
3858 switch (EltSize) { 3859 case 8: 3860 CopyOpc = AArch64::DUPi8; 3861 ExtractSubReg = AArch64::bsub; 3862 break; 3863 case 16: 3864 CopyOpc = AArch64::DUPi16; 3865 ExtractSubReg = AArch64::hsub; 3866 break; 3867 case 32: 3868 CopyOpc = AArch64::DUPi32; 3869 ExtractSubReg = AArch64::ssub; 3870 break; 3871 case 64: 3872 CopyOpc = AArch64::DUPi64; 3873 ExtractSubReg = AArch64::dsub; 3874 break; 3875 default: 3876 // Unknown size, bail out. 3877 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 3878 return false; 3879 } 3880 return true; 3881 } 3882 3883 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 3884 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 3885 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 3886 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 3887 unsigned CopyOpc = 0; 3888 unsigned ExtractSubReg = 0; 3889 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 3890 LLVM_DEBUG( 3891 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 3892 return nullptr; 3893 } 3894 3895 const TargetRegisterClass *DstRC = 3896 getRegClassForTypeOnBank(ScalarTy, DstRB, true); 3897 if (!DstRC) { 3898 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 3899 return nullptr; 3900 } 3901 3902 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 3903 const LLT &VecTy = MRI.getType(VecReg); 3904 const TargetRegisterClass *VecRC = 3905 getRegClassForTypeOnBank(VecTy, VecRB, true); 3906 if (!VecRC) { 3907 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3908 return nullptr; 3909 } 3910 3911 // The register that we're going to copy into. 3912 Register InsertReg = VecReg; 3913 if (!DstReg) 3914 DstReg = MRI.createVirtualRegister(DstRC); 3915 // If the lane index is 0, we just use a subregister COPY. 3916 if (LaneIdx == 0) { 3917 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 3918 .addReg(VecReg, 0, ExtractSubReg); 3919 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 3920 return &*Copy; 3921 } 3922 3923 // Lane copies require 128-bit wide registers. If we're dealing with an 3924 // unpacked vector, then we need to move up to that width. Insert an implicit 3925 // def and a subregister insert to get us there. 3926 if (VecTy.getSizeInBits() != 128) { 3927 MachineInstr *ScalarToVector = emitScalarToVector( 3928 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 3929 if (!ScalarToVector) 3930 return nullptr; 3931 InsertReg = ScalarToVector->getOperand(0).getReg(); 3932 } 3933 3934 MachineInstr *LaneCopyMI = 3935 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 3936 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 3937 3938 // Make sure that we actually constrain the initial copy. 
3939 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 3940 return LaneCopyMI; 3941 } 3942 3943 bool AArch64InstructionSelector::selectExtractElt( 3944 MachineInstr &I, MachineRegisterInfo &MRI) { 3945 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 3946 "unexpected opcode!"); 3947 Register DstReg = I.getOperand(0).getReg(); 3948 const LLT NarrowTy = MRI.getType(DstReg); 3949 const Register SrcReg = I.getOperand(1).getReg(); 3950 const LLT WideTy = MRI.getType(SrcReg); 3951 (void)WideTy; 3952 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 3953 "source register size too small!"); 3954 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 3955 3956 // Need the lane index to determine the correct copy opcode. 3957 MachineOperand &LaneIdxOp = I.getOperand(2); 3958 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 3959 3960 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 3961 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 3962 return false; 3963 } 3964 3965 // Find the index to extract from. 3966 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 3967 if (!VRegAndVal) 3968 return false; 3969 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 3970 3971 3972 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3973 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 3974 LaneIdx, MIB); 3975 if (!Extract) 3976 return false; 3977 3978 I.eraseFromParent(); 3979 return true; 3980 } 3981 3982 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 3983 MachineInstr &I, MachineRegisterInfo &MRI) { 3984 unsigned NumElts = I.getNumOperands() - 1; 3985 Register SrcReg = I.getOperand(NumElts).getReg(); 3986 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 3987 const LLT SrcTy = MRI.getType(SrcReg); 3988 3989 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 3990 if (SrcTy.getSizeInBits() > 128) { 3991 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 3992 return false; 3993 } 3994 3995 // We implement a split vector operation by treating the sub-vectors as 3996 // scalars and extracting them. 3997 const RegisterBank &DstRB = 3998 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 3999 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4000 Register Dst = I.getOperand(OpIdx).getReg(); 4001 MachineInstr *Extract = 4002 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4003 if (!Extract) 4004 return false; 4005 } 4006 I.eraseFromParent(); 4007 return true; 4008 } 4009 4010 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4011 MachineRegisterInfo &MRI) { 4012 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4013 "unexpected opcode"); 4014 4015 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4016 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4017 AArch64::FPRRegBankID || 4018 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4019 AArch64::FPRRegBankID) { 4020 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4021 "currently unsupported.\n"); 4022 return false; 4023 } 4024 4025 // The last operand is the vector source register, and every other operand is 4026 // a register to unpack into. 
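// E.g. (illustrative):
//   %e0(s32), %e1(s32), %e2(s32), %e3(s32) = G_UNMERGE_VALUES %vec(<4 x s32>)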
4027 unsigned NumElts = I.getNumOperands() - 1; 4028 Register SrcReg = I.getOperand(NumElts).getReg(); 4029 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4030 const LLT WideTy = MRI.getType(SrcReg); 4031 (void)WideTy; 4032 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4033 "can only unmerge from vector or s128 types!"); 4034 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4035 "source register size too small!"); 4036 4037 if (!NarrowTy.isScalar()) 4038 return selectSplitVectorUnmerge(I, MRI); 4039 4040 // Choose a lane copy opcode and subregister based off of the size of the 4041 // vector's elements. 4042 unsigned CopyOpc = 0; 4043 unsigned ExtractSubReg = 0; 4044 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4045 return false; 4046 4047 // Set up for the lane copies. 4048 MachineBasicBlock &MBB = *I.getParent(); 4049 4050 // Stores the registers we'll be copying from. 4051 SmallVector<Register, 4> InsertRegs; 4052 4053 // We'll use the first register twice, so we only need NumElts-1 registers. 4054 unsigned NumInsertRegs = NumElts - 1; 4055 4056 // If our elements fit into exactly 128 bits, then we can copy from the source 4057 // directly. Otherwise, we need to do a bit of setup with some subregister 4058 // inserts. 4059 if (NarrowTy.getSizeInBits() * NumElts == 128) { 4060 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg); 4061 } else { 4062 // No. We have to perform subregister inserts. For each insert, create an 4063 // implicit def and a subregister insert, and save the register we create. 4064 const TargetRegisterClass *RC = getRegClassForTypeOnBank( 4065 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()), 4066 *RBI.getRegBank(SrcReg, MRI, TRI)); 4067 unsigned SubReg = 0; 4068 bool Found = getSubRegForClass(RC, TRI, SubReg); 4069 (void)Found; 4070 assert(Found && "expected to find last operand's subeg idx"); 4071 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { 4072 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4073 MachineInstr &ImpDefMI = 4074 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), 4075 ImpDefReg); 4076 4077 // Now, create the subregister insert from SrcReg. 4078 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4079 MachineInstr &InsMI = 4080 *BuildMI(MBB, I, I.getDebugLoc(), 4081 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) 4082 .addUse(ImpDefReg) 4083 .addUse(SrcReg) 4084 .addImm(SubReg); 4085 4086 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); 4087 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); 4088 4089 // Save the register so that we can copy from it after. 4090 InsertRegs.push_back(InsertReg); 4091 } 4092 } 4093 4094 // Now that we've created any necessary subregister inserts, we can 4095 // create the copies. 4096 // 4097 // Perform the first copy separately as a subregister copy. 4098 Register CopyTo = I.getOperand(0).getReg(); 4099 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) 4100 .addReg(InsertRegs[0], 0, ExtractSubReg); 4101 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); 4102 4103 // Now, perform the remaining copies as vector lane copies. 
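// Lane 0 was already handled by the subregister copy above; the loop below
// starts at lane 1 and reuses the first insert register for it.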
4104 unsigned LaneIdx = 1; 4105 for (Register InsReg : InsertRegs) { 4106 Register CopyTo = I.getOperand(LaneIdx).getReg(); 4107 MachineInstr &CopyInst = 4108 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) 4109 .addUse(InsReg) 4110 .addImm(LaneIdx); 4111 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); 4112 ++LaneIdx; 4113 } 4114 4115 // Separately constrain the first copy's destination. Because of the 4116 // limitation in constrainOperandRegClass, we can't guarantee that this will 4117 // actually be constrained. So, do it ourselves using the second operand. 4118 const TargetRegisterClass *RC = 4119 MRI.getRegClassOrNull(I.getOperand(1).getReg()); 4120 if (!RC) { 4121 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); 4122 return false; 4123 } 4124 4125 RBI.constrainGenericRegister(CopyTo, *RC, MRI); 4126 I.eraseFromParent(); 4127 return true; 4128 } 4129 4130 bool AArch64InstructionSelector::selectConcatVectors( 4131 MachineInstr &I, MachineRegisterInfo &MRI) { 4132 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && 4133 "Unexpected opcode"); 4134 Register Dst = I.getOperand(0).getReg(); 4135 Register Op1 = I.getOperand(1).getReg(); 4136 Register Op2 = I.getOperand(2).getReg(); 4137 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); 4138 if (!ConcatMI) 4139 return false; 4140 I.eraseFromParent(); 4141 return true; 4142 } 4143 4144 unsigned 4145 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, 4146 MachineFunction &MF) const { 4147 Type *CPTy = CPVal->getType(); 4148 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); 4149 4150 MachineConstantPool *MCP = MF.getConstantPool(); 4151 return MCP->getConstantPoolIndex(CPVal, Alignment); 4152 } 4153 4154 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( 4155 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { 4156 const TargetRegisterClass *RC; 4157 unsigned Opc; 4158 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny; 4159 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); 4160 switch (Size) { 4161 case 16: 4162 RC = &AArch64::FPR128RegClass; 4163 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui; 4164 break; 4165 case 8: 4166 RC = &AArch64::FPR64RegClass; 4167 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui; 4168 break; 4169 case 4: 4170 RC = &AArch64::FPR32RegClass; 4171 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui; 4172 break; 4173 case 2: 4174 RC = &AArch64::FPR16RegClass; 4175 Opc = AArch64::LDRHui; 4176 break; 4177 default: 4178 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " 4179 << *CPVal->getType()); 4180 return nullptr; 4181 } 4182 4183 MachineInstr *LoadMI = nullptr; 4184 auto &MF = MIRBuilder.getMF(); 4185 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); 4186 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) { 4187 // Use load(literal) for tiny code model. 
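// I.e. a single literal load such as "ldr qN, .LCPIn_m" (label illustrative)
// instead of the ADRP + LDR pair built in the else branch below.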
4188 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx); 4189 } else { 4190 auto Adrp = 4191 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) 4192 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); 4193 4194 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp}) 4195 .addConstantPoolIndex( 4196 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4197 4198 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); 4199 } 4200 4201 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); 4202 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, 4203 MachineMemOperand::MOLoad, 4204 Size, Align(Size))); 4205 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); 4206 return LoadMI; 4207 } 4208 4209 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given 4210 /// size and RB. 4211 static std::pair<unsigned, unsigned> 4212 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4213 unsigned Opc, SubregIdx; 4214 if (RB.getID() == AArch64::GPRRegBankID) { 4215 if (EltSize == 8) { 4216 Opc = AArch64::INSvi8gpr; 4217 SubregIdx = AArch64::bsub; 4218 } else if (EltSize == 16) { 4219 Opc = AArch64::INSvi16gpr; 4220 SubregIdx = AArch64::ssub; 4221 } else if (EltSize == 32) { 4222 Opc = AArch64::INSvi32gpr; 4223 SubregIdx = AArch64::ssub; 4224 } else if (EltSize == 64) { 4225 Opc = AArch64::INSvi64gpr; 4226 SubregIdx = AArch64::dsub; 4227 } else { 4228 llvm_unreachable("invalid elt size!"); 4229 } 4230 } else { 4231 if (EltSize == 8) { 4232 Opc = AArch64::INSvi8lane; 4233 SubregIdx = AArch64::bsub; 4234 } else if (EltSize == 16) { 4235 Opc = AArch64::INSvi16lane; 4236 SubregIdx = AArch64::hsub; 4237 } else if (EltSize == 32) { 4238 Opc = AArch64::INSvi32lane; 4239 SubregIdx = AArch64::ssub; 4240 } else if (EltSize == 64) { 4241 Opc = AArch64::INSvi64lane; 4242 SubregIdx = AArch64::dsub; 4243 } else { 4244 llvm_unreachable("invalid elt size!"); 4245 } 4246 } 4247 return std::make_pair(Opc, SubregIdx); 4248 } 4249 4250 MachineInstr *AArch64InstructionSelector::emitInstr( 4251 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4252 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4253 const ComplexRendererFns &RenderFns) const { 4254 assert(Opcode && "Expected an opcode?"); 4255 assert(!isPreISelGenericOpcode(Opcode) && 4256 "Function should only be used to produce selected instructions!"); 4257 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4258 if (RenderFns) 4259 for (auto &Fn : *RenderFns) 4260 Fn(MI); 4261 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4262 return &*MI; 4263 } 4264 4265 MachineInstr *AArch64InstructionSelector::emitAddSub( 4266 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4267 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4268 MachineIRBuilder &MIRBuilder) const { 4269 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4270 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4271 auto Ty = MRI.getType(LHS.getReg()); 4272 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4273 unsigned Size = Ty.getSizeInBits(); 4274 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4275 bool Is32Bit = Size == 32; 4276 4277 // INSTRri form with positive arithmetic immediate. 
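// (A 12-bit unsigned immediate, optionally shifted left by 12; for emitADD
// this row resolves to ADDWri/ADDXri.)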
4278 if (auto Fns = selectArithImmed(RHS)) 4279 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4280 MIRBuilder, Fns); 4281 4282 // INSTRri form with negative arithmetic immediate. 4283 if (auto Fns = selectNegArithImmed(RHS)) 4284 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4285 MIRBuilder, Fns); 4286 4287 // INSTRrx form. 4288 if (auto Fns = selectArithExtendedRegister(RHS)) 4289 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4290 MIRBuilder, Fns); 4291 4292 // INSTRrs form. 4293 if (auto Fns = selectShiftedRegister(RHS)) 4294 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4295 MIRBuilder, Fns); 4296 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4297 MIRBuilder); 4298 } 4299 4300 MachineInstr * 4301 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4302 MachineOperand &RHS, 4303 MachineIRBuilder &MIRBuilder) const { 4304 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4305 {{AArch64::ADDXri, AArch64::ADDWri}, 4306 {AArch64::ADDXrs, AArch64::ADDWrs}, 4307 {AArch64::ADDXrr, AArch64::ADDWrr}, 4308 {AArch64::SUBXri, AArch64::SUBWri}, 4309 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4310 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4311 } 4312 4313 MachineInstr * 4314 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4315 MachineOperand &RHS, 4316 MachineIRBuilder &MIRBuilder) const { 4317 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4318 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4319 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4320 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4321 {AArch64::SUBSXri, AArch64::SUBSWri}, 4322 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4323 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4324 } 4325 4326 MachineInstr * 4327 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4328 MachineOperand &RHS, 4329 MachineIRBuilder &MIRBuilder) const { 4330 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4331 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4332 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4333 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4334 {AArch64::ADDSXri, AArch64::ADDSWri}, 4335 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4336 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4337 } 4338 4339 MachineInstr * 4340 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS, 4341 MachineOperand &RHS, 4342 MachineIRBuilder &MIRBuilder) const { 4343 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4344 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4345 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4346 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr}; 4347 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4348 } 4349 4350 MachineInstr * 4351 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS, 4352 MachineOperand &RHS, 4353 MachineIRBuilder &MIRBuilder) const { 4354 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4355 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4356 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4357 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr}; 4358 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4359 } 4360 4361 MachineInstr * 4362 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4363 MachineIRBuilder &MIRBuilder) const { 4364 
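// CMN is modelled as an ADDS into a fresh scratch vreg; callers only care
// about the NZCV flags it sets.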
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4365 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4366 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4367 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4368 } 4369 4370 MachineInstr * 4371 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4372 MachineIRBuilder &MIRBuilder) const { 4373 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4374 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4375 LLT Ty = MRI.getType(LHS.getReg()); 4376 unsigned RegSize = Ty.getSizeInBits(); 4377 bool Is32Bit = (RegSize == 32); 4378 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4379 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4380 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4381 // ANDS needs a logical immediate for its immediate form. Check if we can 4382 // fold one in. 4383 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4384 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4385 4386 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4387 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4388 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4389 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4390 return &*TstMI; 4391 } 4392 } 4393 4394 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4395 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4396 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4397 } 4398 4399 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4400 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4401 MachineIRBuilder &MIRBuilder) const { 4402 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4403 assert(Predicate.isPredicate() && "Expected predicate?"); 4404 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4405 LLT CmpTy = MRI.getType(LHS.getReg()); 4406 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4407 unsigned Size = CmpTy.getSizeInBits(); 4408 (void)Size; 4409 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4410 // Fold the compare into a cmn or tst if possible. 
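// (E.g. an eq/ne compare of a G_AND against zero can sometimes be emitted as
// a TST, and a compare against a negated value as a CMN.)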
4411 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4412 return FoldCmp; 4413 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4414 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4415 } 4416 4417 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4418 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4419 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4420 #ifndef NDEBUG 4421 LLT Ty = MRI.getType(Dst); 4422 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4423 "Expected a 32-bit scalar register?"); 4424 #endif 4425 const Register ZReg = AArch64::WZR; 4426 AArch64CC::CondCode CC1, CC2; 4427 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4428 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4429 if (CC2 == AArch64CC::AL) 4430 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4431 MIRBuilder); 4432 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4433 Register Def1Reg = MRI.createVirtualRegister(RC); 4434 Register Def2Reg = MRI.createVirtualRegister(RC); 4435 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4436 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4437 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4438 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4439 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4440 return &*OrMI; 4441 } 4442 4443 MachineInstr *AArch64InstructionSelector::emitFPCompare( 4444 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, 4445 std::optional<CmpInst::Predicate> Pred) const { 4446 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4447 LLT Ty = MRI.getType(LHS); 4448 if (Ty.isVector()) 4449 return nullptr; 4450 unsigned OpSize = Ty.getSizeInBits(); 4451 assert(OpSize == 16 || OpSize == 32 || OpSize == 64); 4452 4453 // If this is a compare against +0.0, then we don't have 4454 // to explicitly materialize a constant. 4455 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); 4456 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); 4457 4458 auto IsEqualityPred = [](CmpInst::Predicate P) { 4459 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || 4460 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; 4461 }; 4462 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { 4463 // Try commutating the operands. 4464 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); 4465 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { 4466 ShouldUseImm = true; 4467 std::swap(LHS, RHS); 4468 } 4469 } 4470 unsigned CmpOpcTbl[2][3] = { 4471 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr}, 4472 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}}; 4473 unsigned CmpOpc = 4474 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)]; 4475 4476 // Partially build the compare. Decide if we need to add a use for the 4477 // third operand based off whether or not we're comparing against 0.0. 4478 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); 4479 CmpMI.setMIFlags(MachineInstr::NoFPExcept); 4480 if (!ShouldUseImm) 4481 CmpMI.addUse(RHS); 4482 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); 4483 return &*CmpMI; 4484 } 4485 4486 MachineInstr *AArch64InstructionSelector::emitVectorConcat( 4487 std::optional<Register> Dst, Register Op1, Register Op2, 4488 MachineIRBuilder &MIRBuilder) const { 4489 // We implement a vector concat by: 4490 // 1. 
Use scalar_to_vector to insert the lower vector into the larger dest 4491 // 2. Insert the upper vector into the destination's upper element 4492 // TODO: some of this code is common with G_BUILD_VECTOR handling. 4493 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4494 4495 const LLT Op1Ty = MRI.getType(Op1); 4496 const LLT Op2Ty = MRI.getType(Op2); 4497 4498 if (Op1Ty != Op2Ty) { 4499 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); 4500 return nullptr; 4501 } 4502 assert(Op1Ty.isVector() && "Expected a vector for vector concat"); 4503 4504 if (Op1Ty.getSizeInBits() >= 128) { 4505 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); 4506 return nullptr; 4507 } 4508 4509 // At the moment we just support 64 bit vector concats. 4510 if (Op1Ty.getSizeInBits() != 64) { 4511 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); 4512 return nullptr; 4513 } 4514 4515 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); 4516 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); 4517 const TargetRegisterClass *DstRC = 4518 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); 4519 4520 MachineInstr *WidenedOp1 = 4521 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); 4522 MachineInstr *WidenedOp2 = 4523 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); 4524 if (!WidenedOp1 || !WidenedOp2) { 4525 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); 4526 return nullptr; 4527 } 4528 4529 // Now do the insert of the upper element. 4530 unsigned InsertOpc, InsSubRegIdx; 4531 std::tie(InsertOpc, InsSubRegIdx) = 4532 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); 4533 4534 if (!Dst) 4535 Dst = MRI.createVirtualRegister(DstRC); 4536 auto InsElt = 4537 MIRBuilder 4538 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) 4539 .addImm(1) /* Lane index */ 4540 .addUse(WidenedOp2->getOperand(0).getReg()) 4541 .addImm(0); 4542 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 4543 return &*InsElt; 4544 } 4545 4546 MachineInstr * 4547 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, 4548 Register Src2, AArch64CC::CondCode Pred, 4549 MachineIRBuilder &MIRBuilder) const { 4550 auto &MRI = *MIRBuilder.getMRI(); 4551 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst); 4552 // If we used a register class, then this won't necessarily have an LLT. 4553 // Compute the size based off whether or not we have a class or bank. 4554 unsigned Size; 4555 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 4556 Size = TRI.getRegSizeInBits(*RC); 4557 else 4558 Size = MRI.getType(Dst).getSizeInBits(); 4559 // Some opcodes use s1. 4560 assert(Size <= 64 && "Expected 64 bits or less only!"); 4561 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr}; 4562 unsigned Opc = OpcTable[Size == 64]; 4563 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred); 4564 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI); 4565 return &*CSINC; 4566 } 4567 4568 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I, 4569 Register CarryReg) { 4570 MachineRegisterInfo *MRI = MIB.getMRI(); 4571 unsigned Opcode = I.getOpcode(); 4572 4573 // If the instruction is a SUB, we need to negate the carry, 4574 // because borrowing is indicated by carry-flag == 0. 
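// (SBCS computes Rn - Rm - !C, so a pending borrow has to be presented to it
// as C == 0.)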
4575 bool NeedsNegatedCarry = 4576 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE); 4577 4578 // If the previous instruction will already produce the correct carry, do not 4579 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences 4580 // generated during legalization of wide add/sub. This optimization depends on 4581 // these sequences not being interrupted by other instructions. 4582 // We have to select the previous instruction before the carry-using 4583 // instruction is deleted by the calling function, otherwise the previous 4584 // instruction might become dead and would get deleted. 4585 MachineInstr *SrcMI = MRI->getVRegDef(CarryReg); 4586 if (SrcMI == I.getPrevNode()) { 4587 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) { 4588 bool ProducesNegatedCarry = CarrySrcMI->isSub(); 4589 if (NeedsNegatedCarry == ProducesNegatedCarry && 4590 CarrySrcMI->isUnsigned() && 4591 CarrySrcMI->getCarryOutReg() == CarryReg && 4592 selectAndRestoreState(*SrcMI)) 4593 return nullptr; 4594 } 4595 } 4596 4597 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass); 4598 4599 if (NeedsNegatedCarry) { 4600 // (0 - Carry) sets !C in NZCV when Carry == 1 4601 Register ZReg = AArch64::WZR; 4602 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB); 4603 } 4604 4605 // (Carry - 1) sets !C in NZCV when Carry == 0 4606 auto Fns = select12BitValueWithLeftShift(1); 4607 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns); 4608 } 4609 4610 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I, 4611 MachineRegisterInfo &MRI) { 4612 auto &CarryMI = cast<GAddSubCarryOut>(I); 4613 4614 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) { 4615 // Set NZCV carry according to carry-in VReg 4616 emitCarryIn(I, CarryInMI->getCarryInReg()); 4617 } 4618 4619 // Emit the operation and get the correct condition code. 4620 auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(), 4621 CarryMI.getLHS(), CarryMI.getRHS(), MIB); 4622 4623 Register CarryOutReg = CarryMI.getCarryOutReg(); 4624 4625 // Don't convert carry-out to VReg if it is never used 4626 if (!MRI.use_nodbg_empty(CarryOutReg)) { 4627 // Now, put the overflow result in the register given by the first operand 4628 // to the overflow op. CSINC increments the result when the predicate is 4629 // false, so to get the increment when it's true, we need to use the 4630 // inverse. In this case, we want to increment when carry is set. 
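// For example (a rough sketch, not the exact operand order in MIR), an
// unsigned add-with-overflow ends up as:
//
//   adds  w8, w0, w1          ; NZCV.C set on unsigned overflow (HS)
//   csinc w2, wzr, wzr, lo    ; LO is the inverse of HS, so w2 = C ? 1 : 0
//
// i.e. CSINC picks WZR when the inverted condition holds and WZR + 1
// otherwise, which is exactly "increment when carry is set".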
4631 Register ZReg = AArch64::WZR; 4632 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg, 4633 getInvertedCondCode(OpAndCC.second), MIB); 4634 } 4635 4636 I.eraseFromParent(); 4637 return true; 4638 } 4639 4640 std::pair<MachineInstr *, AArch64CC::CondCode> 4641 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, 4642 MachineOperand &LHS, 4643 MachineOperand &RHS, 4644 MachineIRBuilder &MIRBuilder) const { 4645 switch (Opcode) { 4646 default: 4647 llvm_unreachable("Unexpected opcode!"); 4648 case TargetOpcode::G_SADDO: 4649 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4650 case TargetOpcode::G_UADDO: 4651 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4652 case TargetOpcode::G_SSUBO: 4653 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4654 case TargetOpcode::G_USUBO: 4655 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4656 case TargetOpcode::G_SADDE: 4657 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4658 case TargetOpcode::G_UADDE: 4659 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4660 case TargetOpcode::G_SSUBE: 4661 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4662 case TargetOpcode::G_USUBE: 4663 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4664 } 4665 } 4666 4667 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be 4668 /// expressed as a conjunction. 4669 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 4670 /// changing the conditions on the CMP tests. 4671 /// (this means we can call emitConjunctionRec() with 4672 /// Negate==true on this sub-tree) 4673 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 4674 /// cannot do the negation naturally. We are required to 4675 /// emit the subtree first in this case. 4676 /// \param WillNegate Is true if are called when the result of this 4677 /// subexpression must be negated. This happens when the 4678 /// outer expression is an OR. We can use this fact to know 4679 /// that we have a double negation (or (or ...) ...) that 4680 /// can be implemented for free. 4681 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, 4682 bool WillNegate, MachineRegisterInfo &MRI, 4683 unsigned Depth = 0) { 4684 if (!MRI.hasOneNonDBGUse(Val)) 4685 return false; 4686 MachineInstr *ValDef = MRI.getVRegDef(Val); 4687 unsigned Opcode = ValDef->getOpcode(); 4688 if (isa<GAnyCmp>(ValDef)) { 4689 CanNegate = true; 4690 MustBeFirst = false; 4691 return true; 4692 } 4693 // Protect against exponential runtime and stack overflow. 4694 if (Depth > 6) 4695 return false; 4696 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { 4697 bool IsOR = Opcode == TargetOpcode::G_OR; 4698 Register O0 = ValDef->getOperand(1).getReg(); 4699 Register O1 = ValDef->getOperand(2).getReg(); 4700 bool CanNegateL; 4701 bool MustBeFirstL; 4702 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) 4703 return false; 4704 bool CanNegateR; 4705 bool MustBeFirstR; 4706 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) 4707 return false; 4708 4709 if (MustBeFirstL && MustBeFirstR) 4710 return false; 4711 4712 if (IsOR) { 4713 // For an OR expression we need to be able to naturally negate at least 4714 // one side or we cannot do the transformation at all. 
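// As an illustration: a CCMP chain natively evaluates a conjunction, so an OR
// has to be rewritten with De Morgan, e.g. (roughly)
//
//   (a < 0) || (b == 3)   ==>   !((a >= 0) && (b != 3))
//
// A compare leaf negates for free (just invert its condition code), while a
// sub-tree that cannot be negated in place must have its *result* inverted
// afterwards instead, hence the requirement that at least one side be
// naturally negatable.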
4715       if (!CanNegateL && !CanNegateR)
4716         return false;
4717       // If the result of the OR will be negated and we can naturally negate
4718       // the leaves, then this sub-tree as a whole negates naturally.
4719       CanNegate = WillNegate && CanNegateL && CanNegateR;
4720       // If we cannot naturally negate the whole sub-tree, then this must be
4721       // emitted first.
4722       MustBeFirst = !CanNegate;
4723     } else {
4724       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4725       // We cannot naturally negate an AND operation.
4726       CanNegate = false;
4727       MustBeFirst = MustBeFirstL || MustBeFirstR;
4728     }
4729     return true;
4730   }
4731   return false;
4732 }
4733 
4734 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4735     Register LHS, Register RHS, CmpInst::Predicate CC,
4736     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4737     MachineIRBuilder &MIB) const {
4738   auto &MRI = *MIB.getMRI();
4739   LLT OpTy = MRI.getType(LHS);
4740   unsigned CCmpOpc;
4741   std::optional<ValueAndVReg> C;
4742   if (CmpInst::isIntPredicate(CC)) {
4743     assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4744     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4745     if (!C || C->Value.sgt(31) || C->Value.slt(-31))
4746       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4747     else if (C->Value.ule(31))
4748       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4749     else
4750       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4751   } else {
4752     assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4753            OpTy.getSizeInBits() == 64);
4754     switch (OpTy.getSizeInBits()) {
4755     case 16:
4756       assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4757       CCmpOpc = AArch64::FCCMPHrr;
4758       break;
4759     case 32:
4760       CCmpOpc = AArch64::FCCMPSrr;
4761       break;
4762     case 64:
4763       CCmpOpc = AArch64::FCCMPDrr;
4764       break;
4765     default:
4766       return nullptr;
4767     }
4768   }
4769   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4770   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4771   auto CCmp =
4772       MIB.buildInstr(CCmpOpc, {}, {LHS});
4773   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4774     CCmp.addImm(C->Value.getZExtValue());
4775   else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4776     CCmp.addImm(C->Value.abs().getZExtValue());
4777   else
4778     CCmp.addReg(RHS);
4779   CCmp.addImm(NZCV).addImm(Predicate);
4780   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4781   return &*CCmp;
4782 }
4783 
4784 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4785     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4786     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4787   // We're at a tree leaf, produce a conditional comparison operation.
4788   auto &MRI = *MIB.getMRI();
4789   MachineInstr *ValDef = MRI.getVRegDef(Val);
4790   unsigned Opcode = ValDef->getOpcode();
4791   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4792     Register LHS = Cmp->getLHSReg();
4793     Register RHS = Cmp->getRHSReg();
4794     CmpInst::Predicate CC = Cmp->getCond();
4795     if (Negate)
4796       CC = CmpInst::getInversePredicate(CC);
4797     if (isa<GICmp>(Cmp)) {
4798       OutCC = changeICMPPredToAArch64CC(CC);
4799     } else {
4800       // Handle special FP cases.
4801       AArch64CC::CondCode ExtraCC;
4802       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4803       // Some floating point conditions can't be tested with a single condition
4804       // code.
Construct an additional comparison in this case. 4805 if (ExtraCC != AArch64CC::AL) { 4806 MachineInstr *ExtraCmp; 4807 if (!CCOp) 4808 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); 4809 else 4810 ExtraCmp = 4811 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); 4812 CCOp = ExtraCmp->getOperand(0).getReg(); 4813 Predicate = ExtraCC; 4814 } 4815 } 4816 4817 // Produce a normal comparison if we are first in the chain 4818 if (!CCOp) { 4819 auto Dst = MRI.cloneVirtualRegister(LHS); 4820 if (isa<GICmp>(Cmp)) 4821 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); 4822 return emitFPCompare(Cmp->getOperand(2).getReg(), 4823 Cmp->getOperand(3).getReg(), MIB); 4824 } 4825 // Otherwise produce a ccmp. 4826 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); 4827 } 4828 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); 4829 4830 bool IsOR = Opcode == TargetOpcode::G_OR; 4831 4832 Register LHS = ValDef->getOperand(1).getReg(); 4833 bool CanNegateL; 4834 bool MustBeFirstL; 4835 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); 4836 assert(ValidL && "Valid conjunction/disjunction tree"); 4837 (void)ValidL; 4838 4839 Register RHS = ValDef->getOperand(2).getReg(); 4840 bool CanNegateR; 4841 bool MustBeFirstR; 4842 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); 4843 assert(ValidR && "Valid conjunction/disjunction tree"); 4844 (void)ValidR; 4845 4846 // Swap sub-tree that must come first to the right side. 4847 if (MustBeFirstL) { 4848 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 4849 std::swap(LHS, RHS); 4850 std::swap(CanNegateL, CanNegateR); 4851 std::swap(MustBeFirstL, MustBeFirstR); 4852 } 4853 4854 bool NegateR; 4855 bool NegateAfterR; 4856 bool NegateL; 4857 bool NegateAfterAll; 4858 if (Opcode == TargetOpcode::G_OR) { 4859 // Swap the sub-tree that we can negate naturally to the left. 4860 if (!CanNegateL) { 4861 assert(CanNegateR && "at least one side must be negatable"); 4862 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 4863 assert(!Negate); 4864 std::swap(LHS, RHS); 4865 NegateR = false; 4866 NegateAfterR = true; 4867 } else { 4868 // Negate the left sub-tree if possible, otherwise negate the result. 4869 NegateR = CanNegateR; 4870 NegateAfterR = !CanNegateR; 4871 } 4872 NegateL = true; 4873 NegateAfterAll = !Negate; 4874 } else { 4875 assert(Opcode == TargetOpcode::G_AND && 4876 "Valid conjunction/disjunction tree"); 4877 assert(!Negate && "Valid conjunction/disjunction tree"); 4878 4879 NegateL = false; 4880 NegateR = false; 4881 NegateAfterR = false; 4882 NegateAfterAll = false; 4883 } 4884 4885 // Emit sub-trees. 
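// Rough shape of the final output for a two-leaf tree such as
// (a == 0) && (b == 0) (register names and immediates are illustrative):
//
//   cmp  x0, #0               ; first leaf: ordinary compare
//   ccmp x1, #0, #0, eq       ; if eq, compare b; otherwise force NZCV to a
//                             ; value (#0 here) that fails the final EQ test
//   ... eq                    ; OutCC is consumed by the caller (csel/cset/b.eq)
//
// The NZCV immediate is chosen by getNZCVToSatisfyCondCode(InvOutCC) above so
// that a failed predicate short-circuits the whole conjunction.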
4886 AArch64CC::CondCode RHSCC; 4887 MachineInstr *CmpR = 4888 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); 4889 if (NegateAfterR) 4890 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 4891 MachineInstr *CmpL = emitConjunctionRec( 4892 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); 4893 if (NegateAfterAll) 4894 OutCC = AArch64CC::getInvertedCondCode(OutCC); 4895 return CmpL; 4896 } 4897 4898 MachineInstr *AArch64InstructionSelector::emitConjunction( 4899 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { 4900 bool DummyCanNegate; 4901 bool DummyMustBeFirst; 4902 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, 4903 *MIB.getMRI())) 4904 return nullptr; 4905 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); 4906 } 4907 4908 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, 4909 MachineInstr &CondMI) { 4910 AArch64CC::CondCode AArch64CC; 4911 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); 4912 if (!ConjMI) 4913 return false; 4914 4915 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); 4916 SelI.eraseFromParent(); 4917 return true; 4918 } 4919 4920 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { 4921 MachineRegisterInfo &MRI = *MIB.getMRI(); 4922 // We want to recognize this pattern: 4923 // 4924 // $z = G_FCMP pred, $x, $y 4925 // ... 4926 // $w = G_SELECT $z, $a, $b 4927 // 4928 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 4929 // some copies/truncs in between.) 4930 // 4931 // If we see this, then we can emit something like this: 4932 // 4933 // fcmp $x, $y 4934 // fcsel $w, $a, $b, pred 4935 // 4936 // Rather than emitting both of the rather long sequences in the standard 4937 // G_FCMP/G_SELECT select methods. 4938 4939 // First, check if the condition is defined by a compare. 4940 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 4941 4942 // We can only fold if all of the defs have one use. 4943 Register CondDefReg = CondDef->getOperand(0).getReg(); 4944 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 4945 // Unless it's another select. 4946 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 4947 if (CondDef == &UI) 4948 continue; 4949 if (UI.getOpcode() != TargetOpcode::G_SELECT) 4950 return false; 4951 } 4952 } 4953 4954 // Is the condition defined by a compare? 4955 unsigned CondOpc = CondDef->getOpcode(); 4956 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { 4957 if (tryOptSelectConjunction(I, *CondDef)) 4958 return true; 4959 return false; 4960 } 4961 4962 AArch64CC::CondCode CondCode; 4963 if (CondOpc == TargetOpcode::G_ICMP) { 4964 auto Pred = 4965 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 4966 CondCode = changeICMPPredToAArch64CC(Pred); 4967 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 4968 CondDef->getOperand(1), MIB); 4969 } else { 4970 // Get the condition code for the select. 4971 auto Pred = 4972 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 4973 AArch64CC::CondCode CondCode2; 4974 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 4975 4976 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 4977 // instructions to emit the comparison. 4978 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 4979 // unnecessary. 
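// For reference: after a single fcmp, FCMP_ONE is tested as (MI || GT) and
// FCMP_UEQ as (EQ || VS), so both need two condition codes. The generic
// G_FCMP path (emitCSetForFCmp above) handles that with roughly
//
//   fcmp s0, s1
//   cset w8, mi
//   cset w9, gt
//   orr  w0, w8, w9
//
// but the CSEL/FCSEL emitted below takes only one predicate, hence the early
// return.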
4980 if (CondCode2 != AArch64CC::AL) 4981 return false; 4982 4983 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 4984 CondDef->getOperand(3).getReg(), MIB)) { 4985 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 4986 return false; 4987 } 4988 } 4989 4990 // Emit the select. 4991 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 4992 I.getOperand(3).getReg(), CondCode, MIB); 4993 I.eraseFromParent(); 4994 return true; 4995 } 4996 4997 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 4998 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4999 MachineIRBuilder &MIRBuilder) const { 5000 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 5001 "Unexpected MachineOperand"); 5002 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5003 // We want to find this sort of thing: 5004 // x = G_SUB 0, y 5005 // G_ICMP z, x 5006 // 5007 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 5008 // e.g: 5009 // 5010 // cmn z, y 5011 5012 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 5013 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 5014 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 5015 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 5016 // Given this: 5017 // 5018 // x = G_SUB 0, y 5019 // G_ICMP x, z 5020 // 5021 // Produce this: 5022 // 5023 // cmn y, z 5024 if (isCMN(LHSDef, P, MRI)) 5025 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 5026 5027 // Same idea here, but with the RHS of the compare instead: 5028 // 5029 // Given this: 5030 // 5031 // x = G_SUB 0, y 5032 // G_ICMP z, x 5033 // 5034 // Produce this: 5035 // 5036 // cmn z, y 5037 if (isCMN(RHSDef, P, MRI)) 5038 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 5039 5040 // Given this: 5041 // 5042 // z = G_AND x, y 5043 // G_ICMP z, 0 5044 // 5045 // Produce this if the compare is signed: 5046 // 5047 // tst x, y 5048 if (!CmpInst::isUnsigned(P) && LHSDef && 5049 LHSDef->getOpcode() == TargetOpcode::G_AND) { 5050 // Make sure that the RHS is 0. 5051 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5052 if (!ValAndVReg || ValAndVReg->Value != 0) 5053 return nullptr; 5054 5055 return emitTST(LHSDef->getOperand(1), 5056 LHSDef->getOperand(2), MIRBuilder); 5057 } 5058 5059 return nullptr; 5060 } 5061 5062 bool AArch64InstructionSelector::selectShuffleVector( 5063 MachineInstr &I, MachineRegisterInfo &MRI) { 5064 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5065 Register Src1Reg = I.getOperand(1).getReg(); 5066 const LLT Src1Ty = MRI.getType(Src1Reg); 5067 Register Src2Reg = I.getOperand(2).getReg(); 5068 const LLT Src2Ty = MRI.getType(Src2Reg); 5069 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 5070 5071 MachineBasicBlock &MBB = *I.getParent(); 5072 MachineFunction &MF = *MBB.getParent(); 5073 LLVMContext &Ctx = MF.getFunction().getContext(); 5074 5075 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 5076 // it's originated from a <1 x T> type. Those should have been lowered into 5077 // G_BUILD_VECTOR earlier. 5078 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 5079 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 5080 return false; 5081 } 5082 5083 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 5084 5085 SmallVector<Constant *, 64> CstIdxs; 5086 for (int Val : Mask) { 5087 // For now, any undef indexes we'll just assume to be 0. 
This should be 5088 // optimized in future, e.g. to select DUP etc. 5089 Val = Val < 0 ? 0 : Val; 5090 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5091 unsigned Offset = Byte + Val * BytesPerElt; 5092 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 5093 } 5094 } 5095 5096 // Use a constant pool to load the index vector for TBL. 5097 Constant *CPVal = ConstantVector::get(CstIdxs); 5098 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 5099 if (!IndexLoad) { 5100 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 5101 return false; 5102 } 5103 5104 if (DstTy.getSizeInBits() != 128) { 5105 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 5106 // This case can be done with TBL1. 5107 MachineInstr *Concat = 5108 emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB); 5109 if (!Concat) { 5110 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 5111 return false; 5112 } 5113 5114 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 5115 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 5116 IndexLoad->getOperand(0).getReg(), MIB); 5117 5118 auto TBL1 = MIB.buildInstr( 5119 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 5120 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 5121 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 5122 5123 auto Copy = 5124 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 5125 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 5126 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 5127 I.eraseFromParent(); 5128 return true; 5129 } 5130 5131 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 5132 // Q registers for regalloc. 5133 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 5134 auto RegSeq = createQTuple(Regs, MIB); 5135 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 5136 {RegSeq, IndexLoad->getOperand(0)}); 5137 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 5138 I.eraseFromParent(); 5139 return true; 5140 } 5141 5142 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 5143 std::optional<Register> DstReg, Register SrcReg, Register EltReg, 5144 unsigned LaneIdx, const RegisterBank &RB, 5145 MachineIRBuilder &MIRBuilder) const { 5146 MachineInstr *InsElt = nullptr; 5147 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5148 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5149 5150 // Create a register to define with the insert if one wasn't passed in. 
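// Illustration of the two paths for, say, lane 1 of a 4 x 32-bit vector
// (register names invented):
//
//   element on the GPR bank:  ins v0.s[1], w1        ; INSvi32gpr
//   element on the FPR bank:  ins v0.s[1], v1.s[0]   ; INSvi32lane, after the
//                             element has first been widened to a 128-bit
//                             register with emitScalarToVector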
5151 if (!DstReg) 5152 DstReg = MRI.createVirtualRegister(DstRC); 5153 5154 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 5155 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 5156 5157 if (RB.getID() == AArch64::FPRRegBankID) { 5158 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 5159 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5160 .addImm(LaneIdx) 5161 .addUse(InsSub->getOperand(0).getReg()) 5162 .addImm(0); 5163 } else { 5164 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5165 .addImm(LaneIdx) 5166 .addUse(EltReg); 5167 } 5168 5169 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 5170 return InsElt; 5171 } 5172 5173 bool AArch64InstructionSelector::selectUSMovFromExtend( 5174 MachineInstr &MI, MachineRegisterInfo &MRI) { 5175 if (MI.getOpcode() != TargetOpcode::G_SEXT && 5176 MI.getOpcode() != TargetOpcode::G_ZEXT && 5177 MI.getOpcode() != TargetOpcode::G_ANYEXT) 5178 return false; 5179 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 5180 const Register DefReg = MI.getOperand(0).getReg(); 5181 const LLT DstTy = MRI.getType(DefReg); 5182 unsigned DstSize = DstTy.getSizeInBits(); 5183 5184 if (DstSize != 32 && DstSize != 64) 5185 return false; 5186 5187 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 5188 MI.getOperand(1).getReg(), MRI); 5189 int64_t Lane; 5190 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 5191 return false; 5192 Register Src0 = Extract->getOperand(1).getReg(); 5193 5194 const LLT &VecTy = MRI.getType(Src0); 5195 5196 if (VecTy.getSizeInBits() != 128) { 5197 const MachineInstr *ScalarToVector = emitScalarToVector( 5198 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 5199 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 5200 Src0 = ScalarToVector->getOperand(0).getReg(); 5201 } 5202 5203 unsigned Opcode; 5204 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 5205 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 5206 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 5207 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 5208 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 5209 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 5210 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 5211 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 5212 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 5213 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 5214 else 5215 llvm_unreachable("Unexpected type combo for S/UMov!"); 5216 5217 // We may need to generate one of these, depending on the type and sign of the 5218 // input: 5219 // DstReg = SMOV Src0, Lane; 5220 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 5221 MachineInstr *ExtI = nullptr; 5222 if (DstSize == 64 && !IsSigned) { 5223 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 5224 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 5225 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 5226 .addImm(0) 5227 .addUse(NewReg) 5228 .addImm(AArch64::sub_32); 5229 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 5230 } else 5231 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 5232 5233 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 5234 MI.eraseFromParent(); 5235 return true; 5236 } 5237 5238 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8( 5239 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5240 unsigned int Op; 5241 if (DstSize == 128) { 5242 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5243 return nullptr; 5244 Op = AArch64::MOVIv16b_ns; 5245 } else { 5246 Op = AArch64::MOVIv8b_ns; 5247 } 5248 5249 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5250 5251 if (AArch64_AM::isAdvSIMDModImmType9(Val)) { 5252 Val = AArch64_AM::encodeAdvSIMDModImmType9(Val); 5253 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5254 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5255 return &*Mov; 5256 } 5257 return nullptr; 5258 } 5259 5260 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16( 5261 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5262 bool Inv) { 5263 5264 unsigned int Op; 5265 if (DstSize == 128) { 5266 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5267 return nullptr; 5268 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16; 5269 } else { 5270 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16; 5271 } 5272 5273 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5274 uint64_t Shift; 5275 5276 if (AArch64_AM::isAdvSIMDModImmType5(Val)) { 5277 Val = AArch64_AM::encodeAdvSIMDModImmType5(Val); 5278 Shift = 0; 5279 } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) { 5280 Val = AArch64_AM::encodeAdvSIMDModImmType6(Val); 5281 Shift = 8; 5282 } else 5283 return nullptr; 5284 5285 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5286 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5287 return &*Mov; 5288 } 5289 5290 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32( 5291 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5292 bool Inv) { 5293 5294 unsigned int Op; 5295 if (DstSize == 128) { 5296 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5297 return nullptr; 5298 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32; 5299 } else { 5300 Op = Inv ? 
AArch64::MVNIv2i32 : AArch64::MOVIv2i32; 5301 } 5302 5303 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5304 uint64_t Shift; 5305 5306 if ((AArch64_AM::isAdvSIMDModImmType1(Val))) { 5307 Val = AArch64_AM::encodeAdvSIMDModImmType1(Val); 5308 Shift = 0; 5309 } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) { 5310 Val = AArch64_AM::encodeAdvSIMDModImmType2(Val); 5311 Shift = 8; 5312 } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) { 5313 Val = AArch64_AM::encodeAdvSIMDModImmType3(Val); 5314 Shift = 16; 5315 } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) { 5316 Val = AArch64_AM::encodeAdvSIMDModImmType4(Val); 5317 Shift = 24; 5318 } else 5319 return nullptr; 5320 5321 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5322 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5323 return &*Mov; 5324 } 5325 5326 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64( 5327 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5328 5329 unsigned int Op; 5330 if (DstSize == 128) { 5331 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5332 return nullptr; 5333 Op = AArch64::MOVIv2d_ns; 5334 } else { 5335 Op = AArch64::MOVID; 5336 } 5337 5338 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5339 if (AArch64_AM::isAdvSIMDModImmType10(Val)) { 5340 Val = AArch64_AM::encodeAdvSIMDModImmType10(Val); 5341 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5342 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5343 return &*Mov; 5344 } 5345 return nullptr; 5346 } 5347 5348 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s( 5349 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5350 bool Inv) { 5351 5352 unsigned int Op; 5353 if (DstSize == 128) { 5354 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5355 return nullptr; 5356 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl; 5357 } else { 5358 Op = Inv ? 
AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl; 5359 } 5360 5361 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5362 uint64_t Shift; 5363 5364 if (AArch64_AM::isAdvSIMDModImmType7(Val)) { 5365 Val = AArch64_AM::encodeAdvSIMDModImmType7(Val); 5366 Shift = 264; 5367 } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) { 5368 Val = AArch64_AM::encodeAdvSIMDModImmType8(Val); 5369 Shift = 272; 5370 } else 5371 return nullptr; 5372 5373 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5374 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5375 return &*Mov; 5376 } 5377 5378 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP( 5379 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5380 5381 unsigned int Op; 5382 bool IsWide = false; 5383 if (DstSize == 128) { 5384 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5385 return nullptr; 5386 Op = AArch64::FMOVv4f32_ns; 5387 IsWide = true; 5388 } else { 5389 Op = AArch64::FMOVv2f32_ns; 5390 } 5391 5392 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5393 5394 if (AArch64_AM::isAdvSIMDModImmType11(Val)) { 5395 Val = AArch64_AM::encodeAdvSIMDModImmType11(Val); 5396 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) { 5397 Val = AArch64_AM::encodeAdvSIMDModImmType12(Val); 5398 Op = AArch64::FMOVv2f64_ns; 5399 } else 5400 return nullptr; 5401 5402 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5403 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5404 return &*Mov; 5405 } 5406 5407 bool AArch64InstructionSelector::selectIndexedExtLoad( 5408 MachineInstr &MI, MachineRegisterInfo &MRI) { 5409 auto &ExtLd = cast<GIndexedAnyExtLoad>(MI); 5410 Register Dst = ExtLd.getDstReg(); 5411 Register WriteBack = ExtLd.getWritebackReg(); 5412 Register Base = ExtLd.getBaseReg(); 5413 Register Offset = ExtLd.getOffsetReg(); 5414 LLT Ty = MRI.getType(Dst); 5415 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs. 5416 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits(); 5417 bool IsPre = ExtLd.isPre(); 5418 bool IsSExt = isa<GIndexedSExtLoad>(ExtLd); 5419 bool InsertIntoXReg = false; 5420 bool IsDst64 = Ty.getSizeInBits() == 64; 5421 5422 unsigned Opc = 0; 5423 LLT NewLdDstTy; 5424 LLT s32 = LLT::scalar(32); 5425 LLT s64 = LLT::scalar(64); 5426 5427 if (MemSizeBits == 8) { 5428 if (IsSExt) { 5429 if (IsDst64) 5430 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; 5431 else 5432 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; 5433 NewLdDstTy = IsDst64 ? s64 : s32; 5434 } else { 5435 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; 5436 InsertIntoXReg = IsDst64; 5437 NewLdDstTy = s32; 5438 } 5439 } else if (MemSizeBits == 16) { 5440 if (IsSExt) { 5441 if (IsDst64) 5442 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; 5443 else 5444 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; 5445 NewLdDstTy = IsDst64 ? s64 : s32; 5446 } else { 5447 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; 5448 InsertIntoXReg = IsDst64; 5449 NewLdDstTy = s32; 5450 } 5451 } else if (MemSizeBits == 32) { 5452 if (IsSExt) { 5453 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; 5454 NewLdDstTy = s64; 5455 } else { 5456 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; 5457 InsertIntoXReg = IsDst64; 5458 NewLdDstTy = s32; 5459 } 5460 } else { 5461 llvm_unreachable("Unexpected size for indexed load"); 5462 } 5463 5464 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5465 return false; // We should be on gpr. 
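// Rough sketch of the result for a post-indexed zext/anyext byte load into a
// 32-bit destination (offset value invented):
//
//   %wb:gpr64sp, %val:gpr32 = LDRBBpost %base, 1     ; ldrb w0, [x1], #1
//
// For a 64-bit destination of a zero/any-extending load, the 32-bit result is
// widened afterwards with SUBREG_TO_REG (the InsertIntoXReg path below), since
// writing the W register already zeroes the top half.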
5466 5467 auto Cst = getIConstantVRegVal(Offset, MRI); 5468 if (!Cst) 5469 return false; // Shouldn't happen, but just in case. 5470 5471 auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base}) 5472 .addImm(Cst->getSExtValue()); 5473 LdMI.cloneMemRefs(ExtLd); 5474 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5475 // Make sure to select the load with the MemTy as the dest type, and then 5476 // insert into X reg if needed. 5477 if (InsertIntoXReg) { 5478 // Generate a SUBREG_TO_REG. 5479 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {}) 5480 .addImm(0) 5481 .addUse(LdMI.getReg(1)) 5482 .addImm(AArch64::sub_32); 5483 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass, 5484 MRI); 5485 } else { 5486 auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1)); 5487 selectCopy(*Copy, TII, MRI, TRI, RBI); 5488 } 5489 MI.eraseFromParent(); 5490 5491 return true; 5492 } 5493 5494 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI, 5495 MachineRegisterInfo &MRI) { 5496 auto &Ld = cast<GIndexedLoad>(MI); 5497 Register Dst = Ld.getDstReg(); 5498 Register WriteBack = Ld.getWritebackReg(); 5499 Register Base = Ld.getBaseReg(); 5500 Register Offset = Ld.getOffsetReg(); 5501 assert(MRI.getType(Dst).getSizeInBits() <= 128 && 5502 "Unexpected type for indexed load"); 5503 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes(); 5504 5505 if (MemSize < MRI.getType(Dst).getSizeInBytes()) 5506 return selectIndexedExtLoad(MI, MRI); 5507 5508 unsigned Opc = 0; 5509 if (Ld.isPre()) { 5510 static constexpr unsigned GPROpcodes[] = { 5511 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre, 5512 AArch64::LDRXpre}; 5513 static constexpr unsigned FPROpcodes[] = { 5514 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre, 5515 AArch64::LDRQpre}; 5516 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5517 Opc = FPROpcodes[Log2_32(MemSize)]; 5518 else 5519 Opc = GPROpcodes[Log2_32(MemSize)]; 5520 } else { 5521 static constexpr unsigned GPROpcodes[] = { 5522 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost, 5523 AArch64::LDRXpost}; 5524 static constexpr unsigned FPROpcodes[] = { 5525 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost, 5526 AArch64::LDRDpost, AArch64::LDRQpost}; 5527 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5528 Opc = FPROpcodes[Log2_32(MemSize)]; 5529 else 5530 Opc = GPROpcodes[Log2_32(MemSize)]; 5531 } 5532 auto Cst = getIConstantVRegVal(Offset, MRI); 5533 if (!Cst) 5534 return false; // Shouldn't happen, but just in case. 
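// The tables are indexed by log2 of the access size in bytes; e.g. a
// pre-indexed 8-byte FPR load (Log2_32(8) == 3) selects LDRDpre, roughly
//
//   %wb:gpr64sp, %val:fpr64 = LDRDpre %base, 16      ; ldr d0, [x0, #16]!
//
// (the #16 is an invented offset; the real immediate comes from the offset
// G_CONSTANT handled below).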
5535 auto LdMI = 5536 MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue()); 5537 LdMI.cloneMemRefs(Ld); 5538 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5539 MI.eraseFromParent(); 5540 return true; 5541 } 5542 5543 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I, 5544 MachineRegisterInfo &MRI) { 5545 Register Dst = I.getWritebackReg(); 5546 Register Val = I.getValueReg(); 5547 Register Base = I.getBaseReg(); 5548 Register Offset = I.getOffsetReg(); 5549 LLT ValTy = MRI.getType(Val); 5550 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store"); 5551 5552 unsigned Opc = 0; 5553 if (I.isPre()) { 5554 static constexpr unsigned GPROpcodes[] = { 5555 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre, 5556 AArch64::STRXpre}; 5557 static constexpr unsigned FPROpcodes[] = { 5558 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre, 5559 AArch64::STRQpre}; 5560 5561 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5562 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5563 else 5564 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5565 } else { 5566 static constexpr unsigned GPROpcodes[] = { 5567 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost, 5568 AArch64::STRXpost}; 5569 static constexpr unsigned FPROpcodes[] = { 5570 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost, 5571 AArch64::STRDpost, AArch64::STRQpost}; 5572 5573 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5574 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5575 else 5576 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5577 } 5578 5579 auto Cst = getIConstantVRegVal(Offset, MRI); 5580 if (!Cst) 5581 return false; // Shouldn't happen, but just in case. 
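// Same table scheme as the indexed loads; e.g. a post-indexed 32-bit GPR
// store (Log2_32(4) == 2) selects STRWpost, roughly
//
//   %wb:gpr64sp = STRWpost %val, %base, 4            ; str w1, [x0], #4
//
// with the writeback register as the only def and the stored value and base
// as uses (the #4 is an invented offset).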
5582 auto Str = 5583 MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue()); 5584 Str.cloneMemRefs(I); 5585 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI); 5586 I.eraseFromParent(); 5587 return true; 5588 } 5589 5590 MachineInstr * 5591 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5592 MachineIRBuilder &MIRBuilder, 5593 MachineRegisterInfo &MRI) { 5594 LLT DstTy = MRI.getType(Dst); 5595 unsigned DstSize = DstTy.getSizeInBits(); 5596 if (CV->isNullValue()) { 5597 if (DstSize == 128) { 5598 auto Mov = 5599 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5600 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5601 return &*Mov; 5602 } 5603 5604 if (DstSize == 64) { 5605 auto Mov = 5606 MIRBuilder 5607 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5608 .addImm(0); 5609 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5610 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5611 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5612 return &*Copy; 5613 } 5614 } 5615 5616 if (CV->getSplatValue()) { 5617 APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger()); 5618 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * { 5619 MachineInstr *NewOp; 5620 bool Inv = false; 5621 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) || 5622 (NewOp = 5623 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5624 (NewOp = 5625 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5626 (NewOp = 5627 tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5628 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) || 5629 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder))) 5630 return NewOp; 5631 5632 DefBits = ~DefBits; 5633 Inv = true; 5634 if ((NewOp = 5635 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5636 (NewOp = 5637 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5638 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv))) 5639 return NewOp; 5640 return nullptr; 5641 }; 5642 5643 if (auto *NewOp = TryMOVIWithBits(DefBits)) 5644 return NewOp; 5645 5646 // See if a fneg of the constant can be materialized with a MOVI, etc 5647 auto TryWithFNeg = [&](APInt DefBits, int NumBits, 5648 unsigned NegOpc) -> MachineInstr * { 5649 // FNegate each sub-element of the constant 5650 APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize); 5651 APInt NegBits(DstSize, 0); 5652 unsigned NumElts = DstSize / NumBits; 5653 for (unsigned i = 0; i < NumElts; i++) 5654 NegBits |= Neg << (NumBits * i); 5655 NegBits = DefBits ^ NegBits; 5656 5657 // Try to create the new constants with MOVI, and if so generate a fneg 5658 // for it. 
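// Worked example (bit patterns only, the element interpretation does not
// matter): a <2 x s64> splat of 0x8000000000000000 is not encodable by any of
// the MOVI/MVNI/FMOV forms above, but clearing the per-element sign bits
// leaves all zeroes, which MOVIv2d_ns can produce, so the constant can be
// materialized roughly as
//
//   movi v0.2d, #0000000000000000
//   fneg v0.2d, v0.2d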
5659 if (auto *NewOp = TryMOVIWithBits(NegBits)) { 5660 Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 5661 NewOp->getOperand(0).setReg(NewDst); 5662 return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst}); 5663 } 5664 return nullptr; 5665 }; 5666 MachineInstr *R; 5667 if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) || 5668 (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) || 5669 (STI.hasFullFP16() && 5670 (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16)))) 5671 return R; 5672 } 5673 5674 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5675 if (!CPLoad) { 5676 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5677 return nullptr; 5678 } 5679 5680 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5681 RBI.constrainGenericRegister( 5682 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5683 return &*Copy; 5684 } 5685 5686 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5687 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5688 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5689 unsigned DstSize = DstTy.getSizeInBits(); 5690 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5691 if (DstSize < 32) 5692 return false; 5693 // Check if we're building a constant vector, in which case we want to 5694 // generate a constant pool load instead of a vector insert sequence. 5695 SmallVector<Constant *, 16> Csts; 5696 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5697 // Try to find G_CONSTANT or G_FCONSTANT 5698 auto *OpMI = 5699 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5700 if (OpMI) 5701 Csts.emplace_back( 5702 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5703 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5704 I.getOperand(Idx).getReg(), MRI))) 5705 Csts.emplace_back( 5706 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5707 else 5708 return false; 5709 } 5710 Constant *CV = ConstantVector::get(Csts); 5711 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5712 return false; 5713 I.eraseFromParent(); 5714 return true; 5715 } 5716 5717 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5718 MachineInstr &I, MachineRegisterInfo &MRI) { 5719 // Given: 5720 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5721 // 5722 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5723 Register Dst = I.getOperand(0).getReg(); 5724 Register EltReg = I.getOperand(1).getReg(); 5725 LLT EltTy = MRI.getType(EltReg); 5726 // If the index isn't on the same bank as its elements, then this can't be a 5727 // SUBREG_TO_REG. 
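// Sketch of the fold (virtual register names invented):
//
//   %v:fpr(<4 x s32>) = G_BUILD_VECTOR %x:fpr(s32), %u, %u, %u   ; %u undef
//     ==>
//   %v:fpr128 = SUBREG_TO_REG 0, %x:fpr32, %subreg.ssub
//
// i.e. the element simply becomes lane 0 and the remaining lanes are left
// undefined, so no INS is needed.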
5728 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5729 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5730 if (EltRB != DstRB) 5731 return false; 5732 if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) { 5733 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI); 5734 })) 5735 return false; 5736 unsigned SubReg; 5737 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); 5738 if (!EltRC) 5739 return false; 5740 const TargetRegisterClass *DstRC = 5741 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); 5742 if (!DstRC) 5743 return false; 5744 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5745 return false; 5746 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5747 .addImm(0) 5748 .addUse(EltReg) 5749 .addImm(SubReg); 5750 I.eraseFromParent(); 5751 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5752 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5753 } 5754 5755 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5756 MachineRegisterInfo &MRI) { 5757 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5758 // Until we port more of the optimized selections, for now just use a vector 5759 // insert sequence. 5760 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5761 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5762 unsigned EltSize = EltTy.getSizeInBits(); 5763 5764 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5765 return true; 5766 if (tryOptBuildVecToSubregToReg(I, MRI)) 5767 return true; 5768 5769 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64) 5770 return false; // Don't support all element types yet. 5771 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5772 5773 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5774 MachineInstr *ScalarToVec = 5775 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5776 I.getOperand(1).getReg(), MIB); 5777 if (!ScalarToVec) 5778 return false; 5779 5780 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5781 unsigned DstSize = DstTy.getSizeInBits(); 5782 5783 // Keep track of the last MI we inserted. Later on, we might be able to save 5784 // a copy using it. 5785 MachineInstr *PrevMI = ScalarToVec; 5786 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5787 // Note that if we don't do a subregister copy, we can end up making an 5788 // extra register. 5789 Register OpReg = I.getOperand(i).getReg(); 5790 // Do not emit inserts for undefs 5791 if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) { 5792 PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB); 5793 DstVec = PrevMI->getOperand(0).getReg(); 5794 } 5795 } 5796 5797 // If DstTy's size in bits is less than 128, then emit a subregister copy 5798 // from DstVec to the last register we've defined. 5799 if (DstSize < 128) { 5800 // Force this to be FPR using the destination vector. 5801 const TargetRegisterClass *RC = 5802 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5803 if (!RC) 5804 return false; 5805 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5806 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5807 return false; 5808 } 5809 5810 unsigned SubReg = 0; 5811 if (!getSubRegForClass(RC, TRI, SubReg)) 5812 return false; 5813 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5814 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << DstSize 5815 << "\n"); 5816 return false; 5817 } 5818 5819 Register Reg = MRI.createVirtualRegister(RC); 5820 Register DstReg = I.getOperand(0).getReg(); 5821 5822 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5823 MachineOperand &RegOp = I.getOperand(1); 5824 RegOp.setReg(Reg); 5825 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5826 } else { 5827 // We either have a vector with all elements (except the first one) undef or 5828 // at least one non-undef non-first element. In the first case, we need to 5829 // constrain the output register ourselves as we may have generated an 5830 // INSERT_SUBREG operation which is a generic operation for which the 5831 // output regclass cannot be automatically chosen. 5832 // 5833 // In the second case, there is no need to do this as it may generate an 5834 // instruction like INSvi32gpr where the regclass can be automatically 5835 // chosen. 5836 // 5837 // Also, we save a copy by re-using the destination register on the final 5838 // insert. 5839 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5840 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5841 5842 Register DstReg = PrevMI->getOperand(0).getReg(); 5843 if (PrevMI == ScalarToVec && DstReg.isVirtual()) { 5844 const TargetRegisterClass *RC = 5845 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5846 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5847 } 5848 } 5849 5850 I.eraseFromParent(); 5851 return true; 5852 } 5853 5854 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 5855 unsigned NumVecs, 5856 MachineInstr &I) { 5857 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5858 assert(Opc && "Expected an opcode?"); 5859 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5860 auto &MRI = *MIB.getMRI(); 5861 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5862 unsigned Size = Ty.getSizeInBits(); 5863 assert((Size == 64 || Size == 128) && 5864 "Destination must be 64 bits or 128 bits?"); 5865 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 5866 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 5867 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 5868 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 5869 Load.cloneMemRefs(I); 5870 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5871 Register SelectedLoadDst = Load->getOperand(0).getReg(); 5872 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 5873 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 5874 .addReg(SelectedLoadDst, 0, SubReg + Idx); 5875 // Emit the subreg copies and immediately select them. 5876 // FIXME: We should refactor our copy code into an emitCopy helper and 5877 // clean up uses of this pattern elsewhere in the selector. 
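// E.g. an aarch64_neon_ld2 producing two <4 x s32> values comes out roughly as
//
//   %pair = LD2Twov4s %ptr              ; ld2 { v0.4s, v1.4s }, [x0]
//   %dst0 = COPY %pair.qsub0
//   %dst1 = COPY %pair.qsub1
//
// with each COPY selected on the spot by selectCopy below.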
5878 selectCopy(*Vec, TII, MRI, TRI, RBI); 5879 } 5880 return true; 5881 } 5882 5883 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic( 5884 unsigned Opc, unsigned NumVecs, MachineInstr &I) { 5885 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5886 assert(Opc && "Expected an opcode?"); 5887 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5888 auto &MRI = *MIB.getMRI(); 5889 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5890 bool Narrow = Ty.getSizeInBits() == 64; 5891 5892 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1; 5893 SmallVector<Register, 4> Regs(NumVecs); 5894 std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(), 5895 [](auto MO) { return MO.getReg(); }); 5896 5897 if (Narrow) { 5898 transform(Regs, Regs.begin(), [this](Register Reg) { 5899 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 5900 ->getOperand(0) 5901 .getReg(); 5902 }); 5903 Ty = Ty.multiplyElements(2); 5904 } 5905 5906 Register Tuple = createQTuple(Regs, MIB); 5907 auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI); 5908 if (!LaneNo) 5909 return false; 5910 5911 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg(); 5912 auto Load = MIB.buildInstr(Opc, {Ty}, {}) 5913 .addReg(Tuple) 5914 .addImm(LaneNo->getZExtValue()) 5915 .addReg(Ptr); 5916 Load.cloneMemRefs(I); 5917 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5918 Register SelectedLoadDst = Load->getOperand(0).getReg(); 5919 unsigned SubReg = AArch64::qsub0; 5920 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 5921 auto Vec = MIB.buildInstr(TargetOpcode::COPY, 5922 {Narrow ? DstOp(&AArch64::FPR128RegClass) 5923 : DstOp(I.getOperand(Idx).getReg())}, 5924 {}) 5925 .addReg(SelectedLoadDst, 0, SubReg + Idx); 5926 Register WideReg = Vec.getReg(0); 5927 // Emit the subreg copies and immediately select them. 5928 selectCopy(*Vec, TII, MRI, TRI, RBI); 5929 if (Narrow && 5930 !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI)) 5931 return false; 5932 } 5933 return true; 5934 } 5935 5936 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I, 5937 unsigned NumVecs, 5938 unsigned Opc) { 5939 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 5940 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 5941 Register Ptr = I.getOperand(1 + NumVecs).getReg(); 5942 5943 SmallVector<Register, 2> Regs(NumVecs); 5944 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 5945 Regs.begin(), [](auto MO) { return MO.getReg(); }); 5946 5947 Register Tuple = Ty.getSizeInBits() == 128 ? 
createQTuple(Regs, MIB) 5948 : createDTuple(Regs, MIB); 5949 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); 5950 Store.cloneMemRefs(I); 5951 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 5952 } 5953 5954 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic( 5955 MachineInstr &I, unsigned NumVecs, unsigned Opc) { 5956 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 5957 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 5958 bool Narrow = Ty.getSizeInBits() == 64; 5959 5960 SmallVector<Register, 2> Regs(NumVecs); 5961 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 5962 Regs.begin(), [](auto MO) { return MO.getReg(); }); 5963 5964 if (Narrow) 5965 transform(Regs, Regs.begin(), [this](Register Reg) { 5966 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 5967 ->getOperand(0) 5968 .getReg(); 5969 }); 5970 5971 Register Tuple = createQTuple(Regs, MIB); 5972 5973 auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI); 5974 if (!LaneNo) 5975 return false; 5976 Register Ptr = I.getOperand(1 + NumVecs + 1).getReg(); 5977 auto Store = MIB.buildInstr(Opc, {}, {}) 5978 .addReg(Tuple) 5979 .addImm(LaneNo->getZExtValue()) 5980 .addReg(Ptr); 5981 Store.cloneMemRefs(I); 5982 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 5983 return true; 5984 } 5985 5986 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 5987 MachineInstr &I, MachineRegisterInfo &MRI) { 5988 // Find the intrinsic ID. 5989 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); 5990 5991 const LLT S8 = LLT::scalar(8); 5992 const LLT S16 = LLT::scalar(16); 5993 const LLT S32 = LLT::scalar(32); 5994 const LLT S64 = LLT::scalar(64); 5995 const LLT P0 = LLT::pointer(0, 64); 5996 // Select the instruction. 5997 switch (IntrinID) { 5998 default: 5999 return false; 6000 case Intrinsic::aarch64_ldxp: 6001 case Intrinsic::aarch64_ldaxp: { 6002 auto NewI = MIB.buildInstr( 6003 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 6004 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 6005 {I.getOperand(3)}); 6006 NewI.cloneMemRefs(I); 6007 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 6008 break; 6009 } 6010 case Intrinsic::aarch64_neon_ld1x2: { 6011 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6012 unsigned Opc = 0; 6013 if (Ty == LLT::fixed_vector(8, S8)) 6014 Opc = AArch64::LD1Twov8b; 6015 else if (Ty == LLT::fixed_vector(16, S8)) 6016 Opc = AArch64::LD1Twov16b; 6017 else if (Ty == LLT::fixed_vector(4, S16)) 6018 Opc = AArch64::LD1Twov4h; 6019 else if (Ty == LLT::fixed_vector(8, S16)) 6020 Opc = AArch64::LD1Twov8h; 6021 else if (Ty == LLT::fixed_vector(2, S32)) 6022 Opc = AArch64::LD1Twov2s; 6023 else if (Ty == LLT::fixed_vector(4, S32)) 6024 Opc = AArch64::LD1Twov4s; 6025 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6026 Opc = AArch64::LD1Twov2d; 6027 else if (Ty == S64 || Ty == P0) 6028 Opc = AArch64::LD1Twov1d; 6029 else 6030 llvm_unreachable("Unexpected type for ld1x2!"); 6031 selectVectorLoadIntrinsic(Opc, 2, I); 6032 break; 6033 } 6034 case Intrinsic::aarch64_neon_ld1x3: { 6035 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6036 unsigned Opc = 0; 6037 if (Ty == LLT::fixed_vector(8, S8)) 6038 Opc = AArch64::LD1Threev8b; 6039 else if (Ty == LLT::fixed_vector(16, S8)) 6040 Opc = AArch64::LD1Threev16b; 6041 else if (Ty == LLT::fixed_vector(4, S16)) 6042 Opc = AArch64::LD1Threev4h; 6043 else if (Ty == LLT::fixed_vector(8, S16)) 6044 Opc = AArch64::LD1Threev8h; 6045 else if (Ty == LLT::fixed_vector(2, S32)) 6046 Opc = AArch64::LD1Threev2s; 6047 else if (Ty == LLT::fixed_vector(4, S32)) 6048 Opc = AArch64::LD1Threev4s; 6049 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6050 Opc = AArch64::LD1Threev2d; 6051 else if (Ty == S64 || Ty == P0) 6052 Opc = AArch64::LD1Threev1d; 6053 else 6054 llvm_unreachable("Unexpected type for ld1x3!"); 6055 selectVectorLoadIntrinsic(Opc, 3, I); 6056 break; 6057 } 6058 case Intrinsic::aarch64_neon_ld1x4: { 6059 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6060 unsigned Opc = 0; 6061 if (Ty == LLT::fixed_vector(8, S8)) 6062 Opc = AArch64::LD1Fourv8b; 6063 else if (Ty == LLT::fixed_vector(16, S8)) 6064 Opc = AArch64::LD1Fourv16b; 6065 else if (Ty == LLT::fixed_vector(4, S16)) 6066 Opc = AArch64::LD1Fourv4h; 6067 else if (Ty == LLT::fixed_vector(8, S16)) 6068 Opc = AArch64::LD1Fourv8h; 6069 else if (Ty == LLT::fixed_vector(2, S32)) 6070 Opc = AArch64::LD1Fourv2s; 6071 else if (Ty == LLT::fixed_vector(4, S32)) 6072 Opc = AArch64::LD1Fourv4s; 6073 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6074 Opc = AArch64::LD1Fourv2d; 6075 else if (Ty == S64 || Ty == P0) 6076 Opc = AArch64::LD1Fourv1d; 6077 else 6078 llvm_unreachable("Unexpected type for ld1x4!"); 6079 selectVectorLoadIntrinsic(Opc, 4, I); 6080 break; 6081 } 6082 case Intrinsic::aarch64_neon_ld2: { 6083 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6084 unsigned Opc = 0; 6085 if (Ty == LLT::fixed_vector(8, S8)) 6086 Opc = AArch64::LD2Twov8b; 6087 else if (Ty == LLT::fixed_vector(16, S8)) 6088 Opc = AArch64::LD2Twov16b; 6089 else if (Ty == LLT::fixed_vector(4, S16)) 6090 Opc = AArch64::LD2Twov4h; 6091 else if (Ty == LLT::fixed_vector(8, S16)) 6092 Opc = AArch64::LD2Twov8h; 6093 else if (Ty == LLT::fixed_vector(2, S32)) 6094 Opc = AArch64::LD2Twov2s; 6095 else if (Ty == LLT::fixed_vector(4, S32)) 6096 Opc = AArch64::LD2Twov4s; 6097 else if (Ty == LLT::fixed_vector(2, S64) || Ty == 
LLT::fixed_vector(2, P0)) 6098 Opc = AArch64::LD2Twov2d; 6099 else if (Ty == S64 || Ty == P0) 6100 Opc = AArch64::LD1Twov1d; 6101 else 6102 llvm_unreachable("Unexpected type for ld2!"); 6103 selectVectorLoadIntrinsic(Opc, 2, I); 6104 break; 6105 } 6106 case Intrinsic::aarch64_neon_ld2lane: { 6107 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6108 unsigned Opc; 6109 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6110 Opc = AArch64::LD2i8; 6111 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6112 Opc = AArch64::LD2i16; 6113 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6114 Opc = AArch64::LD2i32; 6115 else if (Ty == LLT::fixed_vector(2, S64) || 6116 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6117 Opc = AArch64::LD2i64; 6118 else 6119 llvm_unreachable("Unexpected type for st2lane!"); 6120 if (!selectVectorLoadLaneIntrinsic(Opc, 2, I)) 6121 return false; 6122 break; 6123 } 6124 case Intrinsic::aarch64_neon_ld2r: { 6125 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6126 unsigned Opc = 0; 6127 if (Ty == LLT::fixed_vector(8, S8)) 6128 Opc = AArch64::LD2Rv8b; 6129 else if (Ty == LLT::fixed_vector(16, S8)) 6130 Opc = AArch64::LD2Rv16b; 6131 else if (Ty == LLT::fixed_vector(4, S16)) 6132 Opc = AArch64::LD2Rv4h; 6133 else if (Ty == LLT::fixed_vector(8, S16)) 6134 Opc = AArch64::LD2Rv8h; 6135 else if (Ty == LLT::fixed_vector(2, S32)) 6136 Opc = AArch64::LD2Rv2s; 6137 else if (Ty == LLT::fixed_vector(4, S32)) 6138 Opc = AArch64::LD2Rv4s; 6139 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6140 Opc = AArch64::LD2Rv2d; 6141 else if (Ty == S64 || Ty == P0) 6142 Opc = AArch64::LD2Rv1d; 6143 else 6144 llvm_unreachable("Unexpected type for ld2r!"); 6145 selectVectorLoadIntrinsic(Opc, 2, I); 6146 break; 6147 } 6148 case Intrinsic::aarch64_neon_ld3: { 6149 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6150 unsigned Opc = 0; 6151 if (Ty == LLT::fixed_vector(8, S8)) 6152 Opc = AArch64::LD3Threev8b; 6153 else if (Ty == LLT::fixed_vector(16, S8)) 6154 Opc = AArch64::LD3Threev16b; 6155 else if (Ty == LLT::fixed_vector(4, S16)) 6156 Opc = AArch64::LD3Threev4h; 6157 else if (Ty == LLT::fixed_vector(8, S16)) 6158 Opc = AArch64::LD3Threev8h; 6159 else if (Ty == LLT::fixed_vector(2, S32)) 6160 Opc = AArch64::LD3Threev2s; 6161 else if (Ty == LLT::fixed_vector(4, S32)) 6162 Opc = AArch64::LD3Threev4s; 6163 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6164 Opc = AArch64::LD3Threev2d; 6165 else if (Ty == S64 || Ty == P0) 6166 Opc = AArch64::LD1Threev1d; 6167 else 6168 llvm_unreachable("Unexpected type for ld3!"); 6169 selectVectorLoadIntrinsic(Opc, 3, I); 6170 break; 6171 } 6172 case Intrinsic::aarch64_neon_ld3lane: { 6173 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6174 unsigned Opc; 6175 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6176 Opc = AArch64::LD3i8; 6177 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6178 Opc = AArch64::LD3i16; 6179 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6180 Opc = AArch64::LD3i32; 6181 else if (Ty == LLT::fixed_vector(2, S64) || 6182 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6183 Opc = AArch64::LD3i64; 6184 else 6185 llvm_unreachable("Unexpected type for st3lane!"); 6186 if (!selectVectorLoadLaneIntrinsic(Opc, 3, I)) 6187 return false; 6188 break; 6189 } 6190 case Intrinsic::aarch64_neon_ld3r: { 6191 LLT Ty = 
MRI.getType(I.getOperand(0).getReg()); 6192 unsigned Opc = 0; 6193 if (Ty == LLT::fixed_vector(8, S8)) 6194 Opc = AArch64::LD3Rv8b; 6195 else if (Ty == LLT::fixed_vector(16, S8)) 6196 Opc = AArch64::LD3Rv16b; 6197 else if (Ty == LLT::fixed_vector(4, S16)) 6198 Opc = AArch64::LD3Rv4h; 6199 else if (Ty == LLT::fixed_vector(8, S16)) 6200 Opc = AArch64::LD3Rv8h; 6201 else if (Ty == LLT::fixed_vector(2, S32)) 6202 Opc = AArch64::LD3Rv2s; 6203 else if (Ty == LLT::fixed_vector(4, S32)) 6204 Opc = AArch64::LD3Rv4s; 6205 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6206 Opc = AArch64::LD3Rv2d; 6207 else if (Ty == S64 || Ty == P0) 6208 Opc = AArch64::LD3Rv1d; 6209 else 6210 llvm_unreachable("Unexpected type for ld3r!"); 6211 selectVectorLoadIntrinsic(Opc, 3, I); 6212 break; 6213 } 6214 case Intrinsic::aarch64_neon_ld4: { 6215 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6216 unsigned Opc = 0; 6217 if (Ty == LLT::fixed_vector(8, S8)) 6218 Opc = AArch64::LD4Fourv8b; 6219 else if (Ty == LLT::fixed_vector(16, S8)) 6220 Opc = AArch64::LD4Fourv16b; 6221 else if (Ty == LLT::fixed_vector(4, S16)) 6222 Opc = AArch64::LD4Fourv4h; 6223 else if (Ty == LLT::fixed_vector(8, S16)) 6224 Opc = AArch64::LD4Fourv8h; 6225 else if (Ty == LLT::fixed_vector(2, S32)) 6226 Opc = AArch64::LD4Fourv2s; 6227 else if (Ty == LLT::fixed_vector(4, S32)) 6228 Opc = AArch64::LD4Fourv4s; 6229 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6230 Opc = AArch64::LD4Fourv2d; 6231 else if (Ty == S64 || Ty == P0) 6232 Opc = AArch64::LD1Fourv1d; 6233 else 6234 llvm_unreachable("Unexpected type for ld4!"); 6235 selectVectorLoadIntrinsic(Opc, 4, I); 6236 break; 6237 } 6238 case Intrinsic::aarch64_neon_ld4lane: { 6239 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6240 unsigned Opc; 6241 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6242 Opc = AArch64::LD4i8; 6243 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6244 Opc = AArch64::LD4i16; 6245 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6246 Opc = AArch64::LD4i32; 6247 else if (Ty == LLT::fixed_vector(2, S64) || 6248 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6249 Opc = AArch64::LD4i64; 6250 else 6251 llvm_unreachable("Unexpected type for st4lane!"); 6252 if (!selectVectorLoadLaneIntrinsic(Opc, 4, I)) 6253 return false; 6254 break; 6255 } 6256 case Intrinsic::aarch64_neon_ld4r: { 6257 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6258 unsigned Opc = 0; 6259 if (Ty == LLT::fixed_vector(8, S8)) 6260 Opc = AArch64::LD4Rv8b; 6261 else if (Ty == LLT::fixed_vector(16, S8)) 6262 Opc = AArch64::LD4Rv16b; 6263 else if (Ty == LLT::fixed_vector(4, S16)) 6264 Opc = AArch64::LD4Rv4h; 6265 else if (Ty == LLT::fixed_vector(8, S16)) 6266 Opc = AArch64::LD4Rv8h; 6267 else if (Ty == LLT::fixed_vector(2, S32)) 6268 Opc = AArch64::LD4Rv2s; 6269 else if (Ty == LLT::fixed_vector(4, S32)) 6270 Opc = AArch64::LD4Rv4s; 6271 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6272 Opc = AArch64::LD4Rv2d; 6273 else if (Ty == S64 || Ty == P0) 6274 Opc = AArch64::LD4Rv1d; 6275 else 6276 llvm_unreachable("Unexpected type for ld4r!"); 6277 selectVectorLoadIntrinsic(Opc, 4, I); 6278 break; 6279 } 6280 case Intrinsic::aarch64_neon_st1x2: { 6281 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6282 unsigned Opc; 6283 if (Ty == LLT::fixed_vector(8, S8)) 6284 Opc = AArch64::ST1Twov8b; 6285 else if (Ty == LLT::fixed_vector(16, S8)) 6286 Opc = 
AArch64::ST1Twov16b; 6287 else if (Ty == LLT::fixed_vector(4, S16)) 6288 Opc = AArch64::ST1Twov4h; 6289 else if (Ty == LLT::fixed_vector(8, S16)) 6290 Opc = AArch64::ST1Twov8h; 6291 else if (Ty == LLT::fixed_vector(2, S32)) 6292 Opc = AArch64::ST1Twov2s; 6293 else if (Ty == LLT::fixed_vector(4, S32)) 6294 Opc = AArch64::ST1Twov4s; 6295 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6296 Opc = AArch64::ST1Twov2d; 6297 else if (Ty == S64 || Ty == P0) 6298 Opc = AArch64::ST1Twov1d; 6299 else 6300 llvm_unreachable("Unexpected type for st1x2!"); 6301 selectVectorStoreIntrinsic(I, 2, Opc); 6302 break; 6303 } 6304 case Intrinsic::aarch64_neon_st1x3: { 6305 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6306 unsigned Opc; 6307 if (Ty == LLT::fixed_vector(8, S8)) 6308 Opc = AArch64::ST1Threev8b; 6309 else if (Ty == LLT::fixed_vector(16, S8)) 6310 Opc = AArch64::ST1Threev16b; 6311 else if (Ty == LLT::fixed_vector(4, S16)) 6312 Opc = AArch64::ST1Threev4h; 6313 else if (Ty == LLT::fixed_vector(8, S16)) 6314 Opc = AArch64::ST1Threev8h; 6315 else if (Ty == LLT::fixed_vector(2, S32)) 6316 Opc = AArch64::ST1Threev2s; 6317 else if (Ty == LLT::fixed_vector(4, S32)) 6318 Opc = AArch64::ST1Threev4s; 6319 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6320 Opc = AArch64::ST1Threev2d; 6321 else if (Ty == S64 || Ty == P0) 6322 Opc = AArch64::ST1Threev1d; 6323 else 6324 llvm_unreachable("Unexpected type for st1x3!"); 6325 selectVectorStoreIntrinsic(I, 3, Opc); 6326 break; 6327 } 6328 case Intrinsic::aarch64_neon_st1x4: { 6329 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6330 unsigned Opc; 6331 if (Ty == LLT::fixed_vector(8, S8)) 6332 Opc = AArch64::ST1Fourv8b; 6333 else if (Ty == LLT::fixed_vector(16, S8)) 6334 Opc = AArch64::ST1Fourv16b; 6335 else if (Ty == LLT::fixed_vector(4, S16)) 6336 Opc = AArch64::ST1Fourv4h; 6337 else if (Ty == LLT::fixed_vector(8, S16)) 6338 Opc = AArch64::ST1Fourv8h; 6339 else if (Ty == LLT::fixed_vector(2, S32)) 6340 Opc = AArch64::ST1Fourv2s; 6341 else if (Ty == LLT::fixed_vector(4, S32)) 6342 Opc = AArch64::ST1Fourv4s; 6343 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6344 Opc = AArch64::ST1Fourv2d; 6345 else if (Ty == S64 || Ty == P0) 6346 Opc = AArch64::ST1Fourv1d; 6347 else 6348 llvm_unreachable("Unexpected type for st1x4!"); 6349 selectVectorStoreIntrinsic(I, 4, Opc); 6350 break; 6351 } 6352 case Intrinsic::aarch64_neon_st2: { 6353 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6354 unsigned Opc; 6355 if (Ty == LLT::fixed_vector(8, S8)) 6356 Opc = AArch64::ST2Twov8b; 6357 else if (Ty == LLT::fixed_vector(16, S8)) 6358 Opc = AArch64::ST2Twov16b; 6359 else if (Ty == LLT::fixed_vector(4, S16)) 6360 Opc = AArch64::ST2Twov4h; 6361 else if (Ty == LLT::fixed_vector(8, S16)) 6362 Opc = AArch64::ST2Twov8h; 6363 else if (Ty == LLT::fixed_vector(2, S32)) 6364 Opc = AArch64::ST2Twov2s; 6365 else if (Ty == LLT::fixed_vector(4, S32)) 6366 Opc = AArch64::ST2Twov4s; 6367 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6368 Opc = AArch64::ST2Twov2d; 6369 else if (Ty == S64 || Ty == P0) 6370 Opc = AArch64::ST1Twov1d; 6371 else 6372 llvm_unreachable("Unexpected type for st2!"); 6373 selectVectorStoreIntrinsic(I, 2, Opc); 6374 break; 6375 } 6376 case Intrinsic::aarch64_neon_st3: { 6377 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6378 unsigned Opc; 6379 if (Ty == LLT::fixed_vector(8, S8)) 6380 Opc = AArch64::ST3Threev8b; 6381 else if (Ty == LLT::fixed_vector(16, S8)) 6382 Opc 
= AArch64::ST3Threev16b; 6383 else if (Ty == LLT::fixed_vector(4, S16)) 6384 Opc = AArch64::ST3Threev4h; 6385 else if (Ty == LLT::fixed_vector(8, S16)) 6386 Opc = AArch64::ST3Threev8h; 6387 else if (Ty == LLT::fixed_vector(2, S32)) 6388 Opc = AArch64::ST3Threev2s; 6389 else if (Ty == LLT::fixed_vector(4, S32)) 6390 Opc = AArch64::ST3Threev4s; 6391 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6392 Opc = AArch64::ST3Threev2d; 6393 else if (Ty == S64 || Ty == P0) 6394 Opc = AArch64::ST1Threev1d; 6395 else 6396 llvm_unreachable("Unexpected type for st3!"); 6397 selectVectorStoreIntrinsic(I, 3, Opc); 6398 break; 6399 } 6400 case Intrinsic::aarch64_neon_st4: { 6401 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6402 unsigned Opc; 6403 if (Ty == LLT::fixed_vector(8, S8)) 6404 Opc = AArch64::ST4Fourv8b; 6405 else if (Ty == LLT::fixed_vector(16, S8)) 6406 Opc = AArch64::ST4Fourv16b; 6407 else if (Ty == LLT::fixed_vector(4, S16)) 6408 Opc = AArch64::ST4Fourv4h; 6409 else if (Ty == LLT::fixed_vector(8, S16)) 6410 Opc = AArch64::ST4Fourv8h; 6411 else if (Ty == LLT::fixed_vector(2, S32)) 6412 Opc = AArch64::ST4Fourv2s; 6413 else if (Ty == LLT::fixed_vector(4, S32)) 6414 Opc = AArch64::ST4Fourv4s; 6415 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6416 Opc = AArch64::ST4Fourv2d; 6417 else if (Ty == S64 || Ty == P0) 6418 Opc = AArch64::ST1Fourv1d; 6419 else 6420 llvm_unreachable("Unexpected type for st4!"); 6421 selectVectorStoreIntrinsic(I, 4, Opc); 6422 break; 6423 } 6424 case Intrinsic::aarch64_neon_st2lane: { 6425 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6426 unsigned Opc; 6427 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6428 Opc = AArch64::ST2i8; 6429 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6430 Opc = AArch64::ST2i16; 6431 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6432 Opc = AArch64::ST2i32; 6433 else if (Ty == LLT::fixed_vector(2, S64) || 6434 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6435 Opc = AArch64::ST2i64; 6436 else 6437 llvm_unreachable("Unexpected type for st2lane!"); 6438 if (!selectVectorStoreLaneIntrinsic(I, 2, Opc)) 6439 return false; 6440 break; 6441 } 6442 case Intrinsic::aarch64_neon_st3lane: { 6443 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6444 unsigned Opc; 6445 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6446 Opc = AArch64::ST3i8; 6447 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6448 Opc = AArch64::ST3i16; 6449 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6450 Opc = AArch64::ST3i32; 6451 else if (Ty == LLT::fixed_vector(2, S64) || 6452 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6453 Opc = AArch64::ST3i64; 6454 else 6455 llvm_unreachable("Unexpected type for st3lane!"); 6456 if (!selectVectorStoreLaneIntrinsic(I, 3, Opc)) 6457 return false; 6458 break; 6459 } 6460 case Intrinsic::aarch64_neon_st4lane: { 6461 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6462 unsigned Opc; 6463 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6464 Opc = AArch64::ST4i8; 6465 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6466 Opc = AArch64::ST4i16; 6467 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6468 Opc = AArch64::ST4i32; 6469 else if (Ty == LLT::fixed_vector(2, S64) || 6470 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty 
== P0)
      Opc = AArch64::ST4i64;
    else
      llvm_unreachable("Unexpected type for st4lane!");
    if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
      return false;
    break;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //   %dst:gpr(p0) = \
    //       G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
    //       \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //   %Rd:GPR64common, %Rn:GPR64) = \
    //       MOPSMemorySetTaggingPseudo \
    //       %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands is swapped.

    Register DstDef = I.getOperand(0).getReg();
    // I.getOperand(1) is the intrinsic function
    Register DstUse = I.getOperand(2).getReg();
    Register ValUse = I.getOperand(3).getReg();
    Register SizeUse = I.getOperand(4).getReg();

    // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
    Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));

    auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
                                 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
    Memset.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
    break;
  }
  }

  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();

  switch (IntrinID) {
  default:
    break;
  case Intrinsic::aarch64_crypto_sha1h: {
    Register DstReg = I.getOperand(0).getReg();
    Register SrcReg = I.getOperand(2).getReg();

    // FIXME: Should this be an assert?
    if (MRI.getType(DstReg).getSizeInBits() != 32 ||
        MRI.getType(SrcReg).getSizeInBits() != 32)
      return false;

    // The operation has to happen on FPRs. Set up some new FPR registers for
    // the source and destination if they are on GPRs.
    if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
      SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
      MIB.buildCopy({SrcReg}, {I.getOperand(2)});

      // Make sure the copy ends up getting constrained properly.
      RBI.constrainGenericRegister(I.getOperand(2).getReg(),
                                   AArch64::GPR32RegClass, MRI);
    }

    if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
      DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);

    // Actually insert the instruction.
    auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
    constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);

    // Did we create a new register for the destination?
    if (DstReg != I.getOperand(0).getReg()) {
      // Yep. Copy the result of the instruction back into the original
      // destination.
6552 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 6553 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 6554 AArch64::GPR32RegClass, MRI); 6555 } 6556 6557 I.eraseFromParent(); 6558 return true; 6559 } 6560 case Intrinsic::ptrauth_resign: { 6561 Register DstReg = I.getOperand(0).getReg(); 6562 Register ValReg = I.getOperand(2).getReg(); 6563 uint64_t AUTKey = I.getOperand(3).getImm(); 6564 Register AUTDisc = I.getOperand(4).getReg(); 6565 uint64_t PACKey = I.getOperand(5).getImm(); 6566 Register PACDisc = I.getOperand(6).getReg(); 6567 6568 Register AUTAddrDisc = AUTDisc; 6569 uint16_t AUTConstDiscC = 0; 6570 std::tie(AUTConstDiscC, AUTAddrDisc) = 6571 extractPtrauthBlendDiscriminators(AUTDisc, MRI); 6572 6573 Register PACAddrDisc = PACDisc; 6574 uint16_t PACConstDiscC = 0; 6575 std::tie(PACConstDiscC, PACAddrDisc) = 6576 extractPtrauthBlendDiscriminators(PACDisc, MRI); 6577 6578 MIB.buildCopy({AArch64::X16}, {ValReg}); 6579 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6580 MIB.buildInstr(AArch64::AUTPAC) 6581 .addImm(AUTKey) 6582 .addImm(AUTConstDiscC) 6583 .addUse(AUTAddrDisc) 6584 .addImm(PACKey) 6585 .addImm(PACConstDiscC) 6586 .addUse(PACAddrDisc) 6587 .constrainAllUses(TII, TRI, RBI); 6588 MIB.buildCopy({DstReg}, Register(AArch64::X16)); 6589 6590 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6591 I.eraseFromParent(); 6592 return true; 6593 } 6594 case Intrinsic::ptrauth_auth: { 6595 Register DstReg = I.getOperand(0).getReg(); 6596 Register ValReg = I.getOperand(2).getReg(); 6597 uint64_t AUTKey = I.getOperand(3).getImm(); 6598 Register AUTDisc = I.getOperand(4).getReg(); 6599 6600 Register AUTAddrDisc = AUTDisc; 6601 uint16_t AUTConstDiscC = 0; 6602 std::tie(AUTConstDiscC, AUTAddrDisc) = 6603 extractPtrauthBlendDiscriminators(AUTDisc, MRI); 6604 6605 MIB.buildCopy({AArch64::X16}, {ValReg}); 6606 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6607 MIB.buildInstr(AArch64::AUT) 6608 .addImm(AUTKey) 6609 .addImm(AUTConstDiscC) 6610 .addUse(AUTAddrDisc) 6611 .constrainAllUses(TII, TRI, RBI); 6612 MIB.buildCopy({DstReg}, Register(AArch64::X16)); 6613 6614 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6615 I.eraseFromParent(); 6616 return true; 6617 } 6618 case Intrinsic::frameaddress: 6619 case Intrinsic::returnaddress: { 6620 MachineFunction &MF = *I.getParent()->getParent(); 6621 MachineFrameInfo &MFI = MF.getFrameInfo(); 6622 6623 unsigned Depth = I.getOperand(2).getImm(); 6624 Register DstReg = I.getOperand(0).getReg(); 6625 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6626 6627 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 6628 if (!MFReturnAddr) { 6629 // Insert the copy from LR/X30 into the entry block, before it can be 6630 // clobbered by anything. 
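        // The copy is cached in MFReturnAddr so that any further
        // @llvm.returnaddress(0) in this function reuses it instead of
        // emitting another copy of LR.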
6631 MFI.setReturnAddressIsTaken(true); 6632 MFReturnAddr = getFunctionLiveInPhysReg( 6633 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); 6634 } 6635 6636 if (STI.hasPAuth()) { 6637 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 6638 } else { 6639 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 6640 MIB.buildInstr(AArch64::XPACLRI); 6641 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6642 } 6643 6644 I.eraseFromParent(); 6645 return true; 6646 } 6647 6648 MFI.setFrameAddressIsTaken(true); 6649 Register FrameAddr(AArch64::FP); 6650 while (Depth--) { 6651 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 6652 auto Ldr = 6653 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 6654 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 6655 FrameAddr = NextFrame; 6656 } 6657 6658 if (IntrinID == Intrinsic::frameaddress) 6659 MIB.buildCopy({DstReg}, {FrameAddr}); 6660 else { 6661 MFI.setReturnAddressIsTaken(true); 6662 6663 if (STI.hasPAuth()) { 6664 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 6665 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 6666 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 6667 } else { 6668 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 6669 .addImm(1); 6670 MIB.buildInstr(AArch64::XPACLRI); 6671 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6672 } 6673 } 6674 6675 I.eraseFromParent(); 6676 return true; 6677 } 6678 case Intrinsic::aarch64_neon_tbl2: 6679 SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false); 6680 return true; 6681 case Intrinsic::aarch64_neon_tbl3: 6682 SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three, 6683 false); 6684 return true; 6685 case Intrinsic::aarch64_neon_tbl4: 6686 SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false); 6687 return true; 6688 case Intrinsic::aarch64_neon_tbx2: 6689 SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true); 6690 return true; 6691 case Intrinsic::aarch64_neon_tbx3: 6692 SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true); 6693 return true; 6694 case Intrinsic::aarch64_neon_tbx4: 6695 SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true); 6696 return true; 6697 case Intrinsic::swift_async_context_addr: 6698 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 6699 {Register(AArch64::FP)}) 6700 .addImm(8) 6701 .addImm(0); 6702 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 6703 6704 MF->getFrameInfo().setFrameAddressIsTaken(true); 6705 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 6706 I.eraseFromParent(); 6707 return true; 6708 } 6709 return false; 6710 } 6711 6712 // G_PTRAUTH_GLOBAL_VALUE lowering 6713 // 6714 // We have 3 lowering alternatives to choose from: 6715 // - MOVaddrPAC: similar to MOVaddr, with added PAC. 6716 // If the GV doesn't need a GOT load (i.e., is locally defined) 6717 // materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC. 6718 // 6719 // - LOADgotPAC: similar to LOADgot, with added PAC. 6720 // If the GV needs a GOT load, materialize the pointer using the usual 6721 // GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT 6722 // section is assumed to be read-only (for example, via relro mechanism). See 6723 // LowerMOVaddrPAC. 
//
// - LOADauthptrstatic: similar to LOADgot, but uses a
//   special stub slot instead of a GOT slot.
//   Load a signed pointer for symbol 'sym' from a stub slot named
//   'sym$auth_ptr$key$disc' filled by the dynamic linker during relocation
//   resolving. This usually lowers to adrp+ldr, but also emits an entry into
//   .data with an @AUTH relocation. See LowerLOADauthptrstatic.
//
// All 3 are pseudos that are expanded late to longer sequences: this lets us
// provide integrity guarantees on the to-be-signed intermediate values.
//
// LOADauthptrstatic is undesirable because it requires a large section filled
// with often similarly-signed pointers, making it a good harvesting target.
// Thus, it's only used for ptrauth references to extern_weak symbols, to avoid
// null checks.

bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  Register DefReg = I.getOperand(0).getReg();
  Register Addr = I.getOperand(1).getReg();
  uint64_t Key = I.getOperand(2).getImm();
  Register AddrDisc = I.getOperand(3).getReg();
  uint64_t Disc = I.getOperand(4).getImm();
  int64_t Offset = 0;

  if (Key > AArch64PACKey::LAST)
    report_fatal_error("key in ptrauth global out of range [0, " +
                       Twine((int)AArch64PACKey::LAST) + "]");

  // Blend only works if the integer discriminator is 16-bit wide.
  if (!isUInt<16>(Disc))
    report_fatal_error(
        "constant discriminator in ptrauth global out of range [0, 0xffff]");

  // Choosing between 3 lowering alternatives is target-specific.
  if (!STI.isTargetELF() && !STI.isTargetMachO())
    report_fatal_error("ptrauth global lowering only supported on MachO/ELF");

  if (!MRI.hasOneDef(Addr))
    return false;

  // First match any offset we take from the real global.
  const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr);
  if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    Register OffsetReg = DefMI->getOperand(2).getReg();
    if (!MRI.hasOneDef(OffsetReg))
      return false;
    const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg);
    if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
      return false;

    Addr = DefMI->getOperand(1).getReg();
    if (!MRI.hasOneDef(Addr))
      return false;

    DefMI = &*MRI.def_instr_begin(Addr);
    Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue();
  }

  // We should be left with a genuine unauthenticated GlobalValue.
  const GlobalValue *GV;
  if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
    GV = DefMI->getOperand(1).getGlobal();
    Offset += DefMI->getOperand(1).getOffset();
  } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
    GV = DefMI->getOperand(2).getGlobal();
    Offset += DefMI->getOperand(2).getOffset();
  } else {
    return false;
  }

  MachineIRBuilder MIB(I);

  // Classify the reference to determine whether it needs a GOT load.
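  // References classified with AArch64II::MO_GOT (e.g. preemptible symbols
  // under PIC) are materialized via their GOT entry and then signed;
  // otherwise the address is formed directly with adrp+add before signing.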
6799 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 6800 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0); 6801 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) && 6802 "unsupported non-GOT op flags on ptrauth global reference"); 6803 assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) && 6804 "unsupported non-GOT reference to weak ptrauth global"); 6805 6806 std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI); 6807 bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0; 6808 6809 // Non-extern_weak: 6810 // - No GOT load needed -> MOVaddrPAC 6811 // - GOT load for non-extern_weak -> LOADgotPAC 6812 // Note that we disallow extern_weak refs to avoid null checks later. 6813 if (!GV->hasExternalWeakLinkage()) { 6814 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {}); 6815 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6816 MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC) 6817 .addGlobalAddress(GV, Offset) 6818 .addImm(Key) 6819 .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR) 6820 .addImm(Disc) 6821 .constrainAllUses(TII, TRI, RBI); 6822 MIB.buildCopy(DefReg, Register(AArch64::X16)); 6823 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 6824 I.eraseFromParent(); 6825 return true; 6826 } 6827 6828 // extern_weak -> LOADauthptrstatic 6829 6830 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the 6831 // offset alone as a pointer if the symbol wasn't available, which would 6832 // probably break null checks in users. Ptrauth complicates things further: 6833 // error out. 6834 if (Offset != 0) 6835 report_fatal_error( 6836 "unsupported non-zero offset in weak ptrauth global reference"); 6837 6838 if (HasAddrDisc) 6839 report_fatal_error("unsupported weak addr-div ptrauth global"); 6840 6841 MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {}) 6842 .addGlobalAddress(GV, Offset) 6843 .addImm(Key) 6844 .addImm(Disc); 6845 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 6846 6847 I.eraseFromParent(); 6848 return true; 6849 } 6850 6851 void AArch64InstructionSelector::SelectTable(MachineInstr &I, 6852 MachineRegisterInfo &MRI, 6853 unsigned NumVec, unsigned Opc1, 6854 unsigned Opc2, bool isExt) { 6855 Register DstReg = I.getOperand(0).getReg(); 6856 unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? 
Opc1 : Opc2; 6857 6858 // Create the REG_SEQUENCE 6859 SmallVector<Register, 4> Regs; 6860 for (unsigned i = 0; i < NumVec; i++) 6861 Regs.push_back(I.getOperand(i + 2 + isExt).getReg()); 6862 Register RegSeq = createQTuple(Regs, MIB); 6863 6864 Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg(); 6865 MachineInstrBuilder Instr; 6866 if (isExt) { 6867 Register Reg = I.getOperand(2).getReg(); 6868 Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg}); 6869 } else 6870 Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg}); 6871 constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI); 6872 I.eraseFromParent(); 6873 } 6874 6875 InstructionSelector::ComplexRendererFns 6876 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 6877 auto MaybeImmed = getImmedFromMO(Root); 6878 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 6879 return std::nullopt; 6880 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 6881 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6882 } 6883 6884 InstructionSelector::ComplexRendererFns 6885 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 6886 auto MaybeImmed = getImmedFromMO(Root); 6887 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 6888 return std::nullopt; 6889 uint64_t Enc = 31 - *MaybeImmed; 6890 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6891 } 6892 6893 InstructionSelector::ComplexRendererFns 6894 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 6895 auto MaybeImmed = getImmedFromMO(Root); 6896 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 6897 return std::nullopt; 6898 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 6899 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6900 } 6901 6902 InstructionSelector::ComplexRendererFns 6903 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 6904 auto MaybeImmed = getImmedFromMO(Root); 6905 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 6906 return std::nullopt; 6907 uint64_t Enc = 63 - *MaybeImmed; 6908 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6909 } 6910 6911 /// Helper to select an immediate value that can be represented as a 12-bit 6912 /// value shifted left by either 0 or 12. If it is possible to do so, return 6913 /// the immediate and shift value. If not, return std::nullopt. 6914 /// 6915 /// Used by selectArithImmed and selectNegArithImmed. 6916 InstructionSelector::ComplexRendererFns 6917 AArch64InstructionSelector::select12BitValueWithLeftShift( 6918 uint64_t Immed) const { 6919 unsigned ShiftAmt; 6920 if (Immed >> 12 == 0) { 6921 ShiftAmt = 0; 6922 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 6923 ShiftAmt = 12; 6924 Immed = Immed >> 12; 6925 } else 6926 return std::nullopt; 6927 6928 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 6929 return {{ 6930 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 6931 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 6932 }}; 6933 } 6934 6935 /// SelectArithImmed - Select an immediate value that can be represented as 6936 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 6937 /// Val set to the 12-bit value and Shift set to the shifter operand. 
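///
/// For example, an immediate of 0x123 is selectable as {0x123, LSL #0} and
/// 0x123000 as {0x123, LSL #12}, while a value such as 0x123456 has no such
/// encoding and selection returns std::nullopt.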
6938 InstructionSelector::ComplexRendererFns 6939 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 6940 // This function is called from the addsub_shifted_imm ComplexPattern, 6941 // which lists [imm] as the list of opcode it's interested in, however 6942 // we still need to check whether the operand is actually an immediate 6943 // here because the ComplexPattern opcode list is only used in 6944 // root-level opcode matching. 6945 auto MaybeImmed = getImmedFromMO(Root); 6946 if (MaybeImmed == std::nullopt) 6947 return std::nullopt; 6948 return select12BitValueWithLeftShift(*MaybeImmed); 6949 } 6950 6951 /// SelectNegArithImmed - As above, but negates the value before trying to 6952 /// select it. 6953 InstructionSelector::ComplexRendererFns 6954 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 6955 // We need a register here, because we need to know if we have a 64 or 32 6956 // bit immediate. 6957 if (!Root.isReg()) 6958 return std::nullopt; 6959 auto MaybeImmed = getImmedFromMO(Root); 6960 if (MaybeImmed == std::nullopt) 6961 return std::nullopt; 6962 uint64_t Immed = *MaybeImmed; 6963 6964 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 6965 // have the opposite effect on the C flag, so this pattern mustn't match under 6966 // those circumstances. 6967 if (Immed == 0) 6968 return std::nullopt; 6969 6970 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 6971 // the root. 6972 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6973 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 6974 Immed = ~((uint32_t)Immed) + 1; 6975 else 6976 Immed = ~Immed + 1ULL; 6977 6978 if (Immed & 0xFFFFFFFFFF000000ULL) 6979 return std::nullopt; 6980 6981 Immed &= 0xFFFFFFULL; 6982 return select12BitValueWithLeftShift(Immed); 6983 } 6984 6985 /// Checks if we are sure that folding MI into load/store addressing mode is 6986 /// beneficial or not. 6987 /// 6988 /// Returns: 6989 /// - true if folding MI would be beneficial. 6990 /// - false if folding MI would be bad. 6991 /// - std::nullopt if it is not sure whether folding MI is beneficial. 6992 /// 6993 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example: 6994 /// 6995 /// %13:gpr(s64) = G_CONSTANT i64 1 6996 /// %8:gpr(s64) = G_SHL %6, %13(s64) 6997 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64) 6998 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) 6999 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode( 7000 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 7001 if (MI.getOpcode() == AArch64::G_SHL) { 7002 // Address operands with shifts are free, except for running on subtargets 7003 // with AddrLSLSlow14. 7004 if (const auto ValAndVeg = getIConstantVRegValWithLookThrough( 7005 MI.getOperand(2).getReg(), MRI)) { 7006 const APInt ShiftVal = ValAndVeg->Value; 7007 7008 // Don't fold if we know this will be slow. 7009 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4)); 7010 } 7011 } 7012 return std::nullopt; 7013 } 7014 7015 /// Return true if it is worth folding MI into an extended register. That is, 7016 /// if it's safe to pull it into the addressing mode of a load or store as a 7017 /// shift. 7018 /// \p IsAddrOperand whether the def of MI is used as an address operand 7019 /// (e.g. feeding into an LDR/STR). 
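///
/// For example (illustrative MIR), folding the G_SHL below into the G_LOAD's
/// addressing mode avoids materializing the shifted offset in a register, at
/// the cost of recomputing the shift in every user that folds it:
///
///   %off:gpr(s64) = G_SHL %idx, %cst(s64)
///   %addr:gpr(p0) = G_PTR_ADD %base, %off(s64)
///   %val:gpr(s64) = G_LOAD %addr(p0) :: (load (s64))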
7020 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 7021 MachineInstr &MI, const MachineRegisterInfo &MRI, 7022 bool IsAddrOperand) const { 7023 7024 // Always fold if there is one use, or if we're optimizing for size. 7025 Register DefReg = MI.getOperand(0).getReg(); 7026 if (MRI.hasOneNonDBGUse(DefReg) || 7027 MI.getParent()->getParent()->getFunction().hasOptSize()) 7028 return true; 7029 7030 if (IsAddrOperand) { 7031 // If we are already sure that folding MI is good or bad, return the result. 7032 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI)) 7033 return *Worth; 7034 7035 // Fold G_PTR_ADD if its offset operand can be folded 7036 if (MI.getOpcode() == AArch64::G_PTR_ADD) { 7037 MachineInstr *OffsetInst = 7038 getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); 7039 7040 // Note, we already know G_PTR_ADD is used by at least two instructions. 7041 // If we are also sure about whether folding is beneficial or not, 7042 // return the result. 7043 if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI)) 7044 return *Worth; 7045 } 7046 } 7047 7048 // FIXME: Consider checking HasALULSLFast as appropriate. 7049 7050 // We have a fastpath, so folding a shift in and potentially computing it 7051 // many times may be beneficial. Check if this is only used in memory ops. 7052 // If it is, then we should fold. 7053 return all_of(MRI.use_nodbg_instructions(DefReg), 7054 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 7055 } 7056 7057 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 7058 switch (Type) { 7059 case AArch64_AM::SXTB: 7060 case AArch64_AM::SXTH: 7061 case AArch64_AM::SXTW: 7062 return true; 7063 default: 7064 return false; 7065 } 7066 } 7067 7068 InstructionSelector::ComplexRendererFns 7069 AArch64InstructionSelector::selectExtendedSHL( 7070 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 7071 unsigned SizeInBytes, bool WantsExt) const { 7072 assert(Base.isReg() && "Expected base to be a register operand"); 7073 assert(Offset.isReg() && "Expected offset to be a register operand"); 7074 7075 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7076 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 7077 7078 unsigned OffsetOpc = OffsetInst->getOpcode(); 7079 bool LookedThroughZExt = false; 7080 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 7081 // Try to look through a ZEXT. 7082 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 7083 return std::nullopt; 7084 7085 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 7086 OffsetOpc = OffsetInst->getOpcode(); 7087 LookedThroughZExt = true; 7088 7089 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 7090 return std::nullopt; 7091 } 7092 // Make sure that the memory op is a valid size. 7093 int64_t LegalShiftVal = Log2_32(SizeInBytes); 7094 if (LegalShiftVal == 0) 7095 return std::nullopt; 7096 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) 7097 return std::nullopt; 7098 7099 // Now, try to find the specific G_CONSTANT. Start by assuming that the 7100 // register we will offset is the LHS, and the register containing the 7101 // constant is the RHS. 7102 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 7103 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 7104 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 7105 if (!ValAndVReg) { 7106 // We didn't get a constant on the RHS. 
If the opcode is a shift, then 7107 // we're done. 7108 if (OffsetOpc == TargetOpcode::G_SHL) 7109 return std::nullopt; 7110 7111 // If we have a G_MUL, we can use either register. Try looking at the RHS. 7112 std::swap(OffsetReg, ConstantReg); 7113 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 7114 if (!ValAndVReg) 7115 return std::nullopt; 7116 } 7117 7118 // The value must fit into 3 bits, and must be positive. Make sure that is 7119 // true. 7120 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 7121 7122 // Since we're going to pull this into a shift, the constant value must be 7123 // a power of 2. If we got a multiply, then we need to check this. 7124 if (OffsetOpc == TargetOpcode::G_MUL) { 7125 if (!llvm::has_single_bit<uint32_t>(ImmVal)) 7126 return std::nullopt; 7127 7128 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 7129 ImmVal = Log2_32(ImmVal); 7130 } 7131 7132 if ((ImmVal & 0x7) != ImmVal) 7133 return std::nullopt; 7134 7135 // We are only allowed to shift by LegalShiftVal. This shift value is built 7136 // into the instruction, so we can't just use whatever we want. 7137 if (ImmVal != LegalShiftVal) 7138 return std::nullopt; 7139 7140 unsigned SignExtend = 0; 7141 if (WantsExt) { 7142 // Check if the offset is defined by an extend, unless we looked through a 7143 // G_ZEXT earlier. 7144 if (!LookedThroughZExt) { 7145 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 7146 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 7147 if (Ext == AArch64_AM::InvalidShiftExtend) 7148 return std::nullopt; 7149 7150 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 7151 // We only support SXTW for signed extension here. 7152 if (SignExtend && Ext != AArch64_AM::SXTW) 7153 return std::nullopt; 7154 OffsetReg = ExtInst->getOperand(1).getReg(); 7155 } 7156 7157 // Need a 32-bit wide register here. 7158 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 7159 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 7160 } 7161 7162 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 7163 // offset. Signify that we are shifting by setting the shift flag to 1. 7164 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 7165 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 7166 [=](MachineInstrBuilder &MIB) { 7167 // Need to add both immediates here to make sure that they are both 7168 // added to the instruction. 7169 MIB.addImm(SignExtend); 7170 MIB.addImm(1); 7171 }}}; 7172 } 7173 7174 /// This is used for computing addresses like this: 7175 /// 7176 /// ldr x1, [x2, x3, lsl #3] 7177 /// 7178 /// Where x2 is the base register, and x3 is an offset register. The shift-left 7179 /// is a constant value specific to this load instruction. That is, we'll never 7180 /// see anything other than a 3 here (which corresponds to the size of the 7181 /// element being loaded.) 
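///
/// In other words, for an 8-byte access the address computed above is
/// x2 + (x3 << 3); the folded shift amount must equal log2 of the access
/// size, so nothing else can be matched here.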
7182 InstructionSelector::ComplexRendererFns 7183 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( 7184 MachineOperand &Root, unsigned SizeInBytes) const { 7185 if (!Root.isReg()) 7186 return std::nullopt; 7187 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7188 7189 // We want to find something like this: 7190 // 7191 // val = G_CONSTANT LegalShiftVal 7192 // shift = G_SHL off_reg val 7193 // ptr = G_PTR_ADD base_reg shift 7194 // x = G_LOAD ptr 7195 // 7196 // And fold it into this addressing mode: 7197 // 7198 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] 7199 7200 // Check if we can find the G_PTR_ADD. 7201 MachineInstr *PtrAdd = 7202 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7203 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true)) 7204 return std::nullopt; 7205 7206 // Now, try to match an opcode which will match our specific offset. 7207 // We want a G_SHL or a G_MUL. 7208 MachineInstr *OffsetInst = 7209 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); 7210 return selectExtendedSHL(Root, PtrAdd->getOperand(1), 7211 OffsetInst->getOperand(0), SizeInBytes, 7212 /*WantsExt=*/false); 7213 } 7214 7215 /// This is used for computing addresses like this: 7216 /// 7217 /// ldr x1, [x2, x3] 7218 /// 7219 /// Where x2 is the base register, and x3 is an offset register. 7220 /// 7221 /// When possible (or profitable) to fold a G_PTR_ADD into the address 7222 /// calculation, this will do so. Otherwise, it will return std::nullopt. 7223 InstructionSelector::ComplexRendererFns 7224 AArch64InstructionSelector::selectAddrModeRegisterOffset( 7225 MachineOperand &Root) const { 7226 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7227 7228 // We need a GEP. 7229 MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); 7230 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD) 7231 return std::nullopt; 7232 7233 // If this is used more than once, let's not bother folding. 7234 // TODO: Check if they are memory ops. If they are, then we can still fold 7235 // without having to recompute anything. 7236 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) 7237 return std::nullopt; 7238 7239 // Base is the GEP's LHS, offset is its RHS. 7240 return {{[=](MachineInstrBuilder &MIB) { 7241 MIB.addUse(Gep->getOperand(1).getReg()); 7242 }, 7243 [=](MachineInstrBuilder &MIB) { 7244 MIB.addUse(Gep->getOperand(2).getReg()); 7245 }, 7246 [=](MachineInstrBuilder &MIB) { 7247 // Need to add both immediates here to make sure that they are both 7248 // added to the instruction. 7249 MIB.addImm(0); 7250 MIB.addImm(0); 7251 }}}; 7252 } 7253 7254 /// This is intended to be equivalent to selectAddrModeXRO in 7255 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. 7256 InstructionSelector::ComplexRendererFns 7257 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, 7258 unsigned SizeInBytes) const { 7259 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7260 if (!Root.isReg()) 7261 return std::nullopt; 7262 MachineInstr *PtrAdd = 7263 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7264 if (!PtrAdd) 7265 return std::nullopt; 7266 7267 // Check for an immediates which cannot be encoded in the [base + imm] 7268 // addressing mode, and can't be encoded in an add/sub. 
// If this happens, we'll
  // end up with code like:
  //
  // mov x0, wide
  // add x1, base, x0
  // ldr x2, [x1, x0]
  //
  // In this situation, we can use the [base, xreg] addressing mode to save an
  // add/sub:
  //
  // mov x0, wide
  // ldr x2, [base, x0]
  auto ValAndVReg =
      getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
  if (ValAndVReg) {
    unsigned Scale = Log2_32(SizeInBytes);
    int64_t ImmOff = ValAndVReg->Value.getSExtValue();

    // Skip immediates that can be selected in the load/store addressing
    // mode.
    if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
        ImmOff < (0x1000 << Scale))
      return std::nullopt;

    // Helper lambda to decide whether or not it is preferable to emit an add.
    auto isPreferredADD = [](int64_t ImmOff) {
      // Constants in [0x0, 0xfff] can be encoded in an add.
      if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
        return true;

      // Can it be encoded in an add lsl #12?
      if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
        return false;

      // It can be encoded in an add lsl #12, but we may not want to. If it is
      // possible to select this as a single movz, then prefer that. A single
      // movz is faster than an add with a shift.
      return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
             (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
    };

    // If the immediate can be encoded in a single add/sub, then bail out.
    if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
      return std::nullopt;
  }

  // Try to fold shifts into the addressing mode.
  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
  if (AddrModeFns)
    return AddrModeFns;

  // If that doesn't work, see if it's possible to fold in registers from
  // a GEP.
  return selectAddrModeRegisterOffset(Root);
}

/// This is used for computing addresses like this:
///
/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
///
/// Where we have a 64-bit base register, a 32-bit offset register, and an
/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
    return std::nullopt;

  MachineOperand &LHS = PtrAdd->getOperand(1);
  MachineOperand &RHS = PtrAdd->getOperand(2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift.
We can try and fold a G_Z/S/ANYEXT in alone though. 7365 // 7366 // e.g. 7367 // ldr something, [base_reg, ext_reg, sxtw] 7368 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) 7369 return std::nullopt; 7370 7371 // Check if this is an extend. We'll get an extend type if it is. 7372 AArch64_AM::ShiftExtendType Ext = 7373 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); 7374 if (Ext == AArch64_AM::InvalidShiftExtend) 7375 return std::nullopt; 7376 7377 // Need a 32-bit wide register. 7378 MachineIRBuilder MIB(*PtrAdd); 7379 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(), 7380 AArch64::GPR32RegClass, MIB); 7381 unsigned SignExtend = Ext == AArch64_AM::SXTW; 7382 7383 // Base is LHS, offset is ExtReg. 7384 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, 7385 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, 7386 [=](MachineInstrBuilder &MIB) { 7387 MIB.addImm(SignExtend); 7388 MIB.addImm(0); 7389 }}}; 7390 } 7391 7392 /// Select a "register plus unscaled signed 9-bit immediate" address. This 7393 /// should only match when there is an offset that is not valid for a scaled 7394 /// immediate addressing mode. The "Size" argument is the size in bytes of the 7395 /// memory reference, which is needed here to know what is valid for a scaled 7396 /// immediate. 7397 InstructionSelector::ComplexRendererFns 7398 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, 7399 unsigned Size) const { 7400 MachineRegisterInfo &MRI = 7401 Root.getParent()->getParent()->getParent()->getRegInfo(); 7402 7403 if (!Root.isReg()) 7404 return std::nullopt; 7405 7406 if (!isBaseWithConstantOffset(Root, MRI)) 7407 return std::nullopt; 7408 7409 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7410 7411 MachineOperand &OffImm = RootDef->getOperand(2); 7412 if (!OffImm.isReg()) 7413 return std::nullopt; 7414 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); 7415 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT) 7416 return std::nullopt; 7417 int64_t RHSC; 7418 MachineOperand &RHSOp1 = RHS->getOperand(1); 7419 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) 7420 return std::nullopt; 7421 RHSC = RHSOp1.getCImm()->getSExtValue(); 7422 7423 if (RHSC >= -256 && RHSC < 256) { 7424 MachineOperand &Base = RootDef->getOperand(1); 7425 return {{ 7426 [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, 7427 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, 7428 }}; 7429 } 7430 return std::nullopt; 7431 } 7432 7433 InstructionSelector::ComplexRendererFns 7434 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, 7435 unsigned Size, 7436 MachineRegisterInfo &MRI) const { 7437 if (RootDef.getOpcode() != AArch64::G_ADD_LOW) 7438 return std::nullopt; 7439 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); 7440 if (Adrp.getOpcode() != AArch64::ADRP) 7441 return std::nullopt; 7442 7443 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. 
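  // The low-page offset folded into the load/store must be usable as a scaled
  // unsigned immediate: it has to be a multiple of the access size, and the
  // global aligned to at least that size so the scaled form stays valid.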
7444 auto Offset = Adrp.getOperand(1).getOffset(); 7445 if (Offset % Size != 0) 7446 return std::nullopt; 7447 7448 auto GV = Adrp.getOperand(1).getGlobal(); 7449 if (GV->isThreadLocal()) 7450 return std::nullopt; 7451 7452 auto &MF = *RootDef.getParent()->getParent(); 7453 if (GV->getPointerAlignment(MF.getDataLayout()) < Size) 7454 return std::nullopt; 7455 7456 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); 7457 MachineIRBuilder MIRBuilder(RootDef); 7458 Register AdrpReg = Adrp.getOperand(0).getReg(); 7459 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, 7460 [=](MachineInstrBuilder &MIB) { 7461 MIB.addGlobalAddress(GV, Offset, 7462 OpFlags | AArch64II::MO_PAGEOFF | 7463 AArch64II::MO_NC); 7464 }}}; 7465 } 7466 7467 /// Select a "register plus scaled unsigned 12-bit immediate" address. The 7468 /// "Size" argument is the size in bytes of the memory reference, which 7469 /// determines the scale. 7470 InstructionSelector::ComplexRendererFns 7471 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, 7472 unsigned Size) const { 7473 MachineFunction &MF = *Root.getParent()->getParent()->getParent(); 7474 MachineRegisterInfo &MRI = MF.getRegInfo(); 7475 7476 if (!Root.isReg()) 7477 return std::nullopt; 7478 7479 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7480 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { 7481 return {{ 7482 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, 7483 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7484 }}; 7485 } 7486 7487 CodeModel::Model CM = MF.getTarget().getCodeModel(); 7488 // Check if we can fold in the ADD of small code model ADRP + ADD address. 7489 if (CM == CodeModel::Small) { 7490 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); 7491 if (OpFns) 7492 return OpFns; 7493 } 7494 7495 if (isBaseWithConstantOffset(Root, MRI)) { 7496 MachineOperand &LHS = RootDef->getOperand(1); 7497 MachineOperand &RHS = RootDef->getOperand(2); 7498 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); 7499 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); 7500 7501 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); 7502 unsigned Scale = Log2_32(Size); 7503 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { 7504 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) 7505 return {{ 7506 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, 7507 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7508 }}; 7509 7510 return {{ 7511 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, 7512 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7513 }}; 7514 } 7515 } 7516 7517 // Before falling back to our general case, check if the unscaled 7518 // instructions can handle this. If so, that's preferable. 7519 if (selectAddrModeUnscaled(Root, Size)) 7520 return std::nullopt; 7521 7522 return {{ 7523 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 7524 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7525 }}; 7526 } 7527 7528 /// Given a shift instruction, return the correct shift type for that 7529 /// instruction. 
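/// For example, a G_SHL maps to AArch64_AM::LSL and a G_ASHR to
/// AArch64_AM::ASR; opcodes with no corresponding shift type yield
/// AArch64_AM::InvalidShiftExtend.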
7530 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { 7531 switch (MI.getOpcode()) { 7532 default: 7533 return AArch64_AM::InvalidShiftExtend; 7534 case TargetOpcode::G_SHL: 7535 return AArch64_AM::LSL; 7536 case TargetOpcode::G_LSHR: 7537 return AArch64_AM::LSR; 7538 case TargetOpcode::G_ASHR: 7539 return AArch64_AM::ASR; 7540 case TargetOpcode::G_ROTR: 7541 return AArch64_AM::ROR; 7542 } 7543 } 7544 7545 /// Select a "shifted register" operand. If the value is not shifted, set the 7546 /// shift operand to a default value of "lsl 0". 7547 InstructionSelector::ComplexRendererFns 7548 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, 7549 bool AllowROR) const { 7550 if (!Root.isReg()) 7551 return std::nullopt; 7552 MachineRegisterInfo &MRI = 7553 Root.getParent()->getParent()->getParent()->getRegInfo(); 7554 7555 // Check if the operand is defined by an instruction which corresponds to 7556 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. 7557 MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); 7558 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); 7559 if (ShType == AArch64_AM::InvalidShiftExtend) 7560 return std::nullopt; 7561 if (ShType == AArch64_AM::ROR && !AllowROR) 7562 return std::nullopt; 7563 if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false)) 7564 return std::nullopt; 7565 7566 // Need an immediate on the RHS. 7567 MachineOperand &ShiftRHS = ShiftInst->getOperand(2); 7568 auto Immed = getImmedFromMO(ShiftRHS); 7569 if (!Immed) 7570 return std::nullopt; 7571 7572 // We have something that we can fold. Fold in the shift's LHS and RHS into 7573 // the instruction. 7574 MachineOperand &ShiftLHS = ShiftInst->getOperand(1); 7575 Register ShiftReg = ShiftLHS.getReg(); 7576 7577 unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); 7578 unsigned Val = *Immed & (NumBits - 1); 7579 unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); 7580 7581 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, 7582 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; 7583 } 7584 7585 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( 7586 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { 7587 unsigned Opc = MI.getOpcode(); 7588 7589 // Handle explicit extend instructions first. 7590 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { 7591 unsigned Size; 7592 if (Opc == TargetOpcode::G_SEXT) 7593 Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 7594 else 7595 Size = MI.getOperand(2).getImm(); 7596 assert(Size != 64 && "Extend from 64 bits?"); 7597 switch (Size) { 7598 case 8: 7599 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB; 7600 case 16: 7601 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH; 7602 case 32: 7603 return AArch64_AM::SXTW; 7604 default: 7605 return AArch64_AM::InvalidShiftExtend; 7606 } 7607 } 7608 7609 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { 7610 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 7611 assert(Size != 64 && "Extend from 64 bits?"); 7612 switch (Size) { 7613 case 8: 7614 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB; 7615 case 16: 7616 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH; 7617 case 32: 7618 return AArch64_AM::UXTW; 7619 default: 7620 return AArch64_AM::InvalidShiftExtend; 7621 } 7622 } 7623 7624 // Don't have an explicit extend. 
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}

/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return std::nullopt;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
    return std::nullopt;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return std::nullopt;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return std::nullopt;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return std::nullopt;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (isDef32(*ExtInst))
        return std::nullopt;
    }
  }

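  // At this point, Ext and ShiftVal describe an extended-register operand,
  // e.g. the "Wm, SXTW #2" part of "ADD Xd, Xn, Wm, SXTW #2".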
  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
  while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
         STI.isLittleEndian())
    Extract =
        getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
  if (!Extract)
    return std::nullopt;

  if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
    if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
      Register ExtReg = Extract->MI->getOperand(2).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }
  if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
    LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
    auto LaneIdx = getIConstantVRegValWithLookThrough(
        Extract->MI->getOperand(2).getReg(), MRI);
    if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
        LaneIdx->Value.getSExtValue() == 1) {
      Register ExtReg = Extract->MI->getOperand(1).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }

  return std::nullopt;
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  std::optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(*CstVal);
}

void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
         "Expected G_UBSANTRAP");
  MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
}

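// The renderFPImm* helpers below render a G_FCONSTANT as the 8-bit encoded
// immediate form used by FMOV (via AArch64_AM::getFP16Imm and friends).
// Presumably the matching patterns have already verified that the constant is
// representable in that form, since the encoders return -1 otherwise.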
void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
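      // If the instruction right after the def is itself a PHI (i.e. the def
      // is a PHI), skip past the whole PHI group before inserting the copy.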
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs; build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types < 32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    // =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 const AArch64Subtarget &Subtarget,
                                 const AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
} // namespace llvm