1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AArch64. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64GlobalISelUtils.h" 15 #include "AArch64InstrInfo.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64RegisterBankInfo.h" 18 #include "AArch64RegisterInfo.h" 19 #include "AArch64Subtarget.h" 20 #include "AArch64TargetMachine.h" 21 #include "MCTargetDesc/AArch64AddressingModes.h" 22 #include "MCTargetDesc/AArch64MCTargetDesc.h" 23 #include "llvm/BinaryFormat/Dwarf.h" 24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/Utils.h" 30 #include "llvm/CodeGen/MachineBasicBlock.h" 31 #include "llvm/CodeGen/MachineConstantPool.h" 32 #include "llvm/CodeGen/MachineFrameInfo.h" 33 #include "llvm/CodeGen/MachineFunction.h" 34 #include "llvm/CodeGen/MachineInstr.h" 35 #include "llvm/CodeGen/MachineInstrBuilder.h" 36 #include "llvm/CodeGen/MachineMemOperand.h" 37 #include "llvm/CodeGen/MachineOperand.h" 38 #include "llvm/CodeGen/MachineRegisterInfo.h" 39 #include "llvm/CodeGen/TargetOpcodes.h" 40 #include "llvm/CodeGen/TargetRegisterInfo.h" 41 #include "llvm/IR/Constants.h" 42 #include "llvm/IR/DerivedTypes.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/IntrinsicsAArch64.h" 45 #include "llvm/IR/PatternMatch.h" 46 #include "llvm/IR/Type.h" 47 #include "llvm/Pass.h" 48 #include "llvm/Support/Debug.h" 49 #include "llvm/Support/raw_ostream.h" 50 #include <optional> 51 52 #define DEBUG_TYPE "aarch64-isel" 53 54 using namespace llvm; 55 using namespace MIPatternMatch; 56 using namespace AArch64GISelUtils; 57 58 namespace llvm { 59 class BlockFrequencyInfo; 60 class ProfileSummaryInfo; 61 } 62 63 namespace { 64 65 #define GET_GLOBALISEL_PREDICATE_BITSET 66 #include "AArch64GenGlobalISel.inc" 67 #undef GET_GLOBALISEL_PREDICATE_BITSET 68 69 70 class AArch64InstructionSelector : public InstructionSelector { 71 public: 72 AArch64InstructionSelector(const AArch64TargetMachine &TM, 73 const AArch64Subtarget &STI, 74 const AArch64RegisterBankInfo &RBI); 75 76 bool select(MachineInstr &I) override; 77 static const char *getName() { return DEBUG_TYPE; } 78 79 void setupMF(MachineFunction &MF, GISelKnownBits *KB, 80 CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, 81 BlockFrequencyInfo *BFI) override { 82 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); 83 MIB.setMF(MF); 84 85 // hasFnAttribute() is expensive to call on every BRCOND selection, so 86 // cache it here for each run of the selector. 
87 ProduceNonFlagSettingCondBr = 88 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 89 MFReturnAddr = Register(); 90 91 processPHIs(MF); 92 } 93 94 private: 95 /// tblgen-erated 'select' implementation, used as the initial selector for 96 /// the patterns that don't require complex C++. 97 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; 98 99 // A lowering phase that runs before any selection attempts. 100 // Returns true if the instruction was modified. 101 bool preISelLower(MachineInstr &I); 102 103 // An early selection function that runs before the selectImpl() call. 104 bool earlySelect(MachineInstr &I); 105 106 /// Save state that is shared between select calls, call select on \p I and 107 /// then restore the saved state. This can be used to recursively call select 108 /// within a select call. 109 bool selectAndRestoreState(MachineInstr &I); 110 111 // Do some preprocessing of G_PHIs before we begin selection. 112 void processPHIs(MachineFunction &MF); 113 114 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI); 115 116 /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 117 bool contractCrossBankCopyIntoStore(MachineInstr &I, 118 MachineRegisterInfo &MRI); 119 120 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); 121 122 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, 123 MachineRegisterInfo &MRI) const; 124 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, 125 MachineRegisterInfo &MRI) const; 126 127 ///@{ 128 /// Helper functions for selectCompareBranch. 129 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, 130 MachineIRBuilder &MIB) const; 131 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 132 MachineIRBuilder &MIB) const; 133 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 134 MachineIRBuilder &MIB) const; 135 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, 136 MachineBasicBlock *DstMBB, 137 MachineIRBuilder &MIB) const; 138 ///@} 139 140 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, 141 MachineRegisterInfo &MRI); 142 143 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI); 144 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI); 145 146 // Helper to generate an equivalent of scalar_to_vector into a new register, 147 // returned via 'Dst'. 148 MachineInstr *emitScalarToVector(unsigned EltSize, 149 const TargetRegisterClass *DstRC, 150 Register Scalar, 151 MachineIRBuilder &MIRBuilder) const; 152 /// Helper to narrow vector that was widened by emitScalarToVector. 153 /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit 154 /// vector, correspondingly. 155 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg, 156 MachineIRBuilder &MIRBuilder, 157 MachineRegisterInfo &MRI) const; 158 159 /// Emit a lane insert into \p DstReg, or a new vector register if 160 /// std::nullopt is provided. 161 /// 162 /// The lane inserted into is defined by \p LaneIdx. The vector source 163 /// register is given by \p SrcReg. The register containing the element is 164 /// given by \p EltReg. 165 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg, 166 Register EltReg, unsigned LaneIdx, 167 const RegisterBank &RB, 168 MachineIRBuilder &MIRBuilder) const; 169 170 /// Emit a sequence of instructions representing a constant \p CV for a 171 /// vector register \p Dst. (E.g. 
a MOV, or a load from a constant pool.) 172 /// 173 /// \returns the last instruction in the sequence on success, and nullptr 174 /// otherwise. 175 MachineInstr *emitConstantVector(Register Dst, Constant *CV, 176 MachineIRBuilder &MIRBuilder, 177 MachineRegisterInfo &MRI); 178 179 MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits, 180 MachineIRBuilder &MIRBuilder); 181 182 MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits, 183 MachineIRBuilder &MIRBuilder, bool Inv); 184 185 MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits, 186 MachineIRBuilder &MIRBuilder, bool Inv); 187 MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits, 188 MachineIRBuilder &MIRBuilder); 189 MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits, 190 MachineIRBuilder &MIRBuilder, bool Inv); 191 MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits, 192 MachineIRBuilder &MIRBuilder); 193 194 bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI); 195 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, 196 MachineRegisterInfo &MRI); 197 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a 198 /// SUBREG_TO_REG. 199 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI); 200 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI); 201 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 202 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI); 203 204 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI); 205 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI); 206 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI); 207 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI); 208 209 /// Helper function to select vector load intrinsics like 210 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc. 211 /// \p Opc is the opcode that the selected instruction should use. 212 /// \p NumVecs is the number of vector destinations for the instruction. 213 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction. 
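  /// For example, a call to @llvm.aarch64.neon.ld2.* yielding two 4 x s32
  /// vectors would be selected with \p NumVecs = 2 and \p Opc set to the
  /// matching two-register LD2 instruction (illustrative; the exact opcode
  /// depends on the element type and vector width).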
214 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
215 MachineInstr &I);
216 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
217 MachineInstr &I);
218 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
219 unsigned Opc);
220 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
221 unsigned Opc);
222 bool selectIntrinsicWithSideEffects(MachineInstr &I,
223 MachineRegisterInfo &MRI);
224 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
225 bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
226 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
227 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
228 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
229 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232
233 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
234 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
236
237 unsigned emitConstantPoolEntry(const Constant *CPVal,
238 MachineFunction &MF) const;
239 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
240 MachineIRBuilder &MIRBuilder) const;
241
242 // Emit a vector concat operation.
243 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
244 Register Op2,
245 MachineIRBuilder &MIRBuilder) const;
246
247 // Emit an integer compare between LHS and RHS, which checks for Predicate.
248 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
249 MachineOperand &Predicate,
250 MachineIRBuilder &MIRBuilder) const;
251
252 /// Emit a floating point comparison between \p LHS and \p RHS.
253 /// \p Pred, if given, is the intended predicate to use.
254 MachineInstr *
255 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
256 std::optional<CmpInst::Predicate> = std::nullopt) const;
257
258 MachineInstr *
259 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
260 std::initializer_list<llvm::SrcOp> SrcOps,
261 MachineIRBuilder &MIRBuilder,
262 const ComplexRendererFns &RenderFns = std::nullopt) const;
263 /// Helper function to emit an add or sub instruction.
264 ///
265 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants listed
266 /// below in a specific order.
267 ///
268 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
269 ///
270 /// \code
271 /// const std::array<std::array<unsigned, 2>, 5> Table {
272 /// {{AArch64::ADDXri, AArch64::ADDWri},
273 /// {AArch64::ADDXrs, AArch64::ADDWrs},
274 /// {AArch64::ADDXrr, AArch64::ADDWrr},
275 /// {AArch64::SUBXri, AArch64::SUBWri},
276 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
277 /// \endcode
278 ///
279 /// Each row in the table corresponds to a different addressing mode. Each
280 /// column corresponds to a different register size.
281 /// 282 /// \attention Rows must be structured as follows: 283 /// - Row 0: The ri opcode variants 284 /// - Row 1: The rs opcode variants 285 /// - Row 2: The rr opcode variants 286 /// - Row 3: The ri opcode variants for negative immediates 287 /// - Row 4: The rx opcode variants 288 /// 289 /// \attention Columns must be structured as follows: 290 /// - Column 0: The 64-bit opcode variants 291 /// - Column 1: The 32-bit opcode variants 292 /// 293 /// \p Dst is the destination register of the binop to emit. 294 /// \p LHS is the left-hand operand of the binop to emit. 295 /// \p RHS is the right-hand operand of the binop to emit. 296 MachineInstr *emitAddSub( 297 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 298 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 299 MachineIRBuilder &MIRBuilder) const; 300 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, 301 MachineOperand &RHS, 302 MachineIRBuilder &MIRBuilder) const; 303 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 304 MachineIRBuilder &MIRBuilder) const; 305 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 306 MachineIRBuilder &MIRBuilder) const; 307 MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 308 MachineIRBuilder &MIRBuilder) const; 309 MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, 310 MachineIRBuilder &MIRBuilder) const; 311 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, 312 MachineIRBuilder &MIRBuilder) const; 313 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, 314 MachineIRBuilder &MIRBuilder) const; 315 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS, 316 AArch64CC::CondCode CC, 317 MachineIRBuilder &MIRBuilder) const; 318 MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg, 319 const RegisterBank &DstRB, LLT ScalarTy, 320 Register VecReg, unsigned LaneIdx, 321 MachineIRBuilder &MIRBuilder) const; 322 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2, 323 AArch64CC::CondCode Pred, 324 MachineIRBuilder &MIRBuilder) const; 325 /// Emit a CSet for a FP compare. 326 /// 327 /// \p Dst is expected to be a 32-bit scalar register. 328 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, 329 MachineIRBuilder &MIRBuilder) const; 330 331 /// Emit an instruction that sets NZCV to the carry-in expected by \p I. 332 /// Might elide the instruction if the previous instruction already sets NZCV 333 /// correctly. 334 MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg); 335 336 /// Emit the overflow op for \p Opcode. 337 /// 338 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO, 339 /// G_USUBO, etc. 340 std::pair<MachineInstr *, AArch64CC::CondCode> 341 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, 342 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; 343 344 bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI); 345 346 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). 347 /// In some cases this is even possible with OR operations in the expression. 
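  ///
  /// Rough sketch of the idea (assuming scalar GPR operands): a value such as
  /// \code
  ///   %c0 = G_ICMP intpred(eq), %a, %b
  ///   %c1 = G_ICMP intpred(slt), %x, %y
  ///   %val = G_AND %c0, %c1
  /// \endcode
  /// can be emitted as a compare followed by a conditional compare, with
  /// \p OutCC set to the condition to test on the final NZCV value.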
348 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, 349 MachineIRBuilder &MIB) const; 350 MachineInstr *emitConditionalComparison(Register LHS, Register RHS, 351 CmpInst::Predicate CC, 352 AArch64CC::CondCode Predicate, 353 AArch64CC::CondCode OutCC, 354 MachineIRBuilder &MIB) const; 355 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, 356 bool Negate, Register CCOp, 357 AArch64CC::CondCode Predicate, 358 MachineIRBuilder &MIB) const; 359 360 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. 361 /// \p IsNegative is true if the test should be "not zero". 362 /// This will also optimize the test bit instruction when possible. 363 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, 364 MachineBasicBlock *DstMBB, 365 MachineIRBuilder &MIB) const; 366 367 /// Emit a CB(N)Z instruction which branches to \p DestMBB. 368 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, 369 MachineBasicBlock *DestMBB, 370 MachineIRBuilder &MIB) const; 371 372 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. 373 // We use these manually instead of using the importer since it doesn't 374 // support SDNodeXForm. 375 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; 376 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; 377 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; 378 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; 379 380 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; 381 ComplexRendererFns selectArithImmed(MachineOperand &Root) const; 382 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; 383 384 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, 385 unsigned Size) const; 386 387 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { 388 return selectAddrModeUnscaled(Root, 1); 389 } 390 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { 391 return selectAddrModeUnscaled(Root, 2); 392 } 393 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { 394 return selectAddrModeUnscaled(Root, 4); 395 } 396 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { 397 return selectAddrModeUnscaled(Root, 8); 398 } 399 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { 400 return selectAddrModeUnscaled(Root, 16); 401 } 402 403 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used 404 /// from complex pattern matchers like selectAddrModeIndexed(). 405 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, 406 MachineRegisterInfo &MRI) const; 407 408 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, 409 unsigned Size) const; 410 template <int Width> 411 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { 412 return selectAddrModeIndexed(Root, Width / 8); 413 } 414 415 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, 416 const MachineRegisterInfo &MRI) const; 417 ComplexRendererFns 418 selectAddrModeShiftedExtendXReg(MachineOperand &Root, 419 unsigned SizeInBytes) const; 420 421 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether 422 /// or not a shift + extend should be folded into an addressing mode. Returns 423 /// None when this is not profitable or possible. 
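  ///
  /// For example, an address of the form (G_PTR_ADD %base, (G_SHL %idx, 3))
  /// feeding an 8-byte access can be folded into a [base, idx, lsl #3]
  /// register-offset addressing mode (illustrative sketch of the intent).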
424 ComplexRendererFns 425 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, 426 MachineOperand &Offset, unsigned SizeInBytes, 427 bool WantsExt) const; 428 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; 429 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, 430 unsigned SizeInBytes) const; 431 template <int Width> 432 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { 433 return selectAddrModeXRO(Root, Width / 8); 434 } 435 436 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, 437 unsigned SizeInBytes) const; 438 template <int Width> 439 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { 440 return selectAddrModeWRO(Root, Width / 8); 441 } 442 443 ComplexRendererFns selectShiftedRegister(MachineOperand &Root, 444 bool AllowROR = false) const; 445 446 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { 447 return selectShiftedRegister(Root); 448 } 449 450 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { 451 return selectShiftedRegister(Root, true); 452 } 453 454 /// Given an extend instruction, determine the correct shift-extend type for 455 /// that instruction. 456 /// 457 /// If the instruction is going to be used in a load or store, pass 458 /// \p IsLoadStore = true. 459 AArch64_AM::ShiftExtendType 460 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, 461 bool IsLoadStore = false) const; 462 463 /// Move \p Reg to \p RC if \p Reg is not already on \p RC. 464 /// 465 /// \returns Either \p Reg if no change was necessary, or the new register 466 /// created by moving \p Reg. 467 /// 468 /// Note: This uses emitCopy right now. 469 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, 470 MachineIRBuilder &MIB) const; 471 472 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; 473 474 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const; 475 476 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, 477 int OpIdx = -1) const; 478 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, 479 int OpIdx = -1) const; 480 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, 481 int OpIdx = -1) const; 482 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, 483 int OpIdx = -1) const; 484 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, 485 int OpIdx = -1) const; 486 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, 487 int OpIdx = -1) const; 488 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB, 489 const MachineInstr &MI, 490 int OpIdx = -1) const; 491 492 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. 493 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); 494 495 // Optimization methods. 496 bool tryOptSelect(GSelect &Sel); 497 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); 498 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 499 MachineOperand &Predicate, 500 MachineIRBuilder &MIRBuilder) const; 501 502 /// Return true if \p MI is a load or store of \p NumBytes bytes. 503 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; 504 505 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit 506 /// register zeroed out. In other words, the result of MI has been explicitly 507 /// zero extended. 
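  ///
  /// Informal note: on AArch64, instructions writing a W register implicitly
  /// zero bits [63:32] of the corresponding X register, so most 32-bit defs
  /// qualify; value-forwarding opcodes such as COPY or G_PHI do not. The
  /// implementation is the authoritative list.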
508 bool isDef32(const MachineInstr &MI) const; 509 510 const AArch64TargetMachine &TM; 511 const AArch64Subtarget &STI; 512 const AArch64InstrInfo &TII; 513 const AArch64RegisterInfo &TRI; 514 const AArch64RegisterBankInfo &RBI; 515 516 bool ProduceNonFlagSettingCondBr = false; 517 518 // Some cached values used during selection. 519 // We use LR as a live-in register, and we keep track of it here as it can be 520 // clobbered by calls. 521 Register MFReturnAddr; 522 523 MachineIRBuilder MIB; 524 525 #define GET_GLOBALISEL_PREDICATES_DECL 526 #include "AArch64GenGlobalISel.inc" 527 #undef GET_GLOBALISEL_PREDICATES_DECL 528 529 // We declare the temporaries used by selectImpl() in the class to minimize the 530 // cost of constructing placeholder values. 531 #define GET_GLOBALISEL_TEMPORARIES_DECL 532 #include "AArch64GenGlobalISel.inc" 533 #undef GET_GLOBALISEL_TEMPORARIES_DECL 534 }; 535 536 } // end anonymous namespace 537 538 #define GET_GLOBALISEL_IMPL 539 #include "AArch64GenGlobalISel.inc" 540 #undef GET_GLOBALISEL_IMPL 541 542 AArch64InstructionSelector::AArch64InstructionSelector( 543 const AArch64TargetMachine &TM, const AArch64Subtarget &STI, 544 const AArch64RegisterBankInfo &RBI) 545 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), 546 RBI(RBI), 547 #define GET_GLOBALISEL_PREDICATES_INIT 548 #include "AArch64GenGlobalISel.inc" 549 #undef GET_GLOBALISEL_PREDICATES_INIT 550 #define GET_GLOBALISEL_TEMPORARIES_INIT 551 #include "AArch64GenGlobalISel.inc" 552 #undef GET_GLOBALISEL_TEMPORARIES_INIT 553 { 554 } 555 556 // FIXME: This should be target-independent, inferred from the types declared 557 // for each class in the bank. 558 // 559 /// Given a register bank, and a type, return the smallest register class that 560 /// can represent that combination. 561 static const TargetRegisterClass * 562 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, 563 bool GetAllRegSet = false) { 564 if (RB.getID() == AArch64::GPRRegBankID) { 565 if (Ty.getSizeInBits() <= 32) 566 return GetAllRegSet ? &AArch64::GPR32allRegClass 567 : &AArch64::GPR32RegClass; 568 if (Ty.getSizeInBits() == 64) 569 return GetAllRegSet ? &AArch64::GPR64allRegClass 570 : &AArch64::GPR64RegClass; 571 if (Ty.getSizeInBits() == 128) 572 return &AArch64::XSeqPairsClassRegClass; 573 return nullptr; 574 } 575 576 if (RB.getID() == AArch64::FPRRegBankID) { 577 switch (Ty.getSizeInBits()) { 578 case 8: 579 return &AArch64::FPR8RegClass; 580 case 16: 581 return &AArch64::FPR16RegClass; 582 case 32: 583 return &AArch64::FPR32RegClass; 584 case 64: 585 return &AArch64::FPR64RegClass; 586 case 128: 587 return &AArch64::FPR128RegClass; 588 } 589 return nullptr; 590 } 591 592 return nullptr; 593 } 594 595 /// Given a register bank, and size in bits, return the smallest register class 596 /// that can represent that combination. 597 static const TargetRegisterClass * 598 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, 599 bool GetAllRegSet = false) { 600 unsigned RegBankID = RB.getID(); 601 602 if (RegBankID == AArch64::GPRRegBankID) { 603 if (SizeInBits <= 32) 604 return GetAllRegSet ? &AArch64::GPR32allRegClass 605 : &AArch64::GPR32RegClass; 606 if (SizeInBits == 64) 607 return GetAllRegSet ? 
&AArch64::GPR64allRegClass 608 : &AArch64::GPR64RegClass; 609 if (SizeInBits == 128) 610 return &AArch64::XSeqPairsClassRegClass; 611 } 612 613 if (RegBankID == AArch64::FPRRegBankID) { 614 switch (SizeInBits) { 615 default: 616 return nullptr; 617 case 8: 618 return &AArch64::FPR8RegClass; 619 case 16: 620 return &AArch64::FPR16RegClass; 621 case 32: 622 return &AArch64::FPR32RegClass; 623 case 64: 624 return &AArch64::FPR64RegClass; 625 case 128: 626 return &AArch64::FPR128RegClass; 627 } 628 } 629 630 return nullptr; 631 } 632 633 /// Returns the correct subregister to use for a given register class. 634 static bool getSubRegForClass(const TargetRegisterClass *RC, 635 const TargetRegisterInfo &TRI, unsigned &SubReg) { 636 switch (TRI.getRegSizeInBits(*RC)) { 637 case 8: 638 SubReg = AArch64::bsub; 639 break; 640 case 16: 641 SubReg = AArch64::hsub; 642 break; 643 case 32: 644 if (RC != &AArch64::FPR32RegClass) 645 SubReg = AArch64::sub_32; 646 else 647 SubReg = AArch64::ssub; 648 break; 649 case 64: 650 SubReg = AArch64::dsub; 651 break; 652 default: 653 LLVM_DEBUG( 654 dbgs() << "Couldn't find appropriate subregister for register class."); 655 return false; 656 } 657 658 return true; 659 } 660 661 /// Returns the minimum size the given register bank can hold. 662 static unsigned getMinSizeForRegBank(const RegisterBank &RB) { 663 switch (RB.getID()) { 664 case AArch64::GPRRegBankID: 665 return 32; 666 case AArch64::FPRRegBankID: 667 return 8; 668 default: 669 llvm_unreachable("Tried to get minimum size for unknown register bank."); 670 } 671 } 672 673 /// Create a REG_SEQUENCE instruction using the registers in \p Regs. 674 /// Helper function for functions like createDTuple and createQTuple. 675 /// 676 /// \p RegClassIDs - The list of register class IDs available for some tuple of 677 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is 678 /// expected to contain between 2 and 4 tuple classes. 679 /// 680 /// \p SubRegs - The list of subregister classes associated with each register 681 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0 682 /// subregister class. The index of each subregister class is expected to 683 /// correspond with the index of each register class. 684 /// 685 /// \returns Either the destination register of REG_SEQUENCE instruction that 686 /// was created, or the 0th element of \p Regs if \p Regs contains a single 687 /// element. 688 static Register createTuple(ArrayRef<Register> Regs, 689 const unsigned RegClassIDs[], 690 const unsigned SubRegs[], MachineIRBuilder &MIB) { 691 unsigned NumRegs = Regs.size(); 692 if (NumRegs == 1) 693 return Regs[0]; 694 assert(NumRegs >= 2 && NumRegs <= 4 && 695 "Only support between two and 4 registers in a tuple!"); 696 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo(); 697 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]); 698 auto RegSequence = 699 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {}); 700 for (unsigned I = 0, E = Regs.size(); I < E; ++I) { 701 RegSequence.addUse(Regs[I]); 702 RegSequence.addImm(SubRegs[I]); 703 } 704 return RegSequence.getReg(0); 705 } 706 707 /// Create a tuple of D-registers using the registers in \p Regs. 
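///
/// E.g., for two registers this produces, roughly,
/// \code
///   %tuple:dd = REG_SEQUENCE %regs[0], %subreg.dsub0, %regs[1], %subreg.dsub1
/// \endcode
/// and returns %tuple; a single input register is returned unchanged.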
708 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 709 static const unsigned RegClassIDs[] = { 710 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; 711 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, 712 AArch64::dsub2, AArch64::dsub3}; 713 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 714 } 715 716 /// Create a tuple of Q-registers using the registers in \p Regs. 717 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 718 static const unsigned RegClassIDs[] = { 719 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; 720 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, 721 AArch64::qsub2, AArch64::qsub3}; 722 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 723 } 724 725 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { 726 auto &MI = *Root.getParent(); 727 auto &MBB = *MI.getParent(); 728 auto &MF = *MBB.getParent(); 729 auto &MRI = MF.getRegInfo(); 730 uint64_t Immed; 731 if (Root.isImm()) 732 Immed = Root.getImm(); 733 else if (Root.isCImm()) 734 Immed = Root.getCImm()->getZExtValue(); 735 else if (Root.isReg()) { 736 auto ValAndVReg = 737 getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true); 738 if (!ValAndVReg) 739 return std::nullopt; 740 Immed = ValAndVReg->Value.getSExtValue(); 741 } else 742 return std::nullopt; 743 return Immed; 744 } 745 746 /// Check whether \p I is a currently unsupported binary operation: 747 /// - it has an unsized type 748 /// - an operand is not a vreg 749 /// - all operands are not in the same bank 750 /// These are checks that should someday live in the verifier, but right now, 751 /// these are mostly limitations of the aarch64 selector. 752 static bool unsupportedBinOp(const MachineInstr &I, 753 const AArch64RegisterBankInfo &RBI, 754 const MachineRegisterInfo &MRI, 755 const AArch64RegisterInfo &TRI) { 756 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 757 if (!Ty.isValid()) { 758 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); 759 return true; 760 } 761 762 const RegisterBank *PrevOpBank = nullptr; 763 for (auto &MO : I.operands()) { 764 // FIXME: Support non-register operands. 765 if (!MO.isReg()) { 766 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); 767 return true; 768 } 769 770 // FIXME: Can generic operations have physical registers operands? If 771 // so, this will need to be taught about that, and we'll need to get the 772 // bank out of the minimal class for the register. 773 // Either way, this needs to be documented (and possibly verified). 774 if (!MO.getReg().isVirtual()) { 775 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); 776 return true; 777 } 778 779 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); 780 if (!OpBank) { 781 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); 782 return true; 783 } 784 785 if (PrevOpBank && OpBank != PrevOpBank) { 786 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); 787 return true; 788 } 789 PrevOpBank = OpBank; 790 } 791 return false; 792 } 793 794 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc 795 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID 796 /// and of size \p OpSize. 797 /// \returns \p GenericOpc if the combination is unsupported. 
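/// For example, (G_SHL, GPRRegBankID, 32) maps to AArch64::LSLVWr, and
/// (G_FADD, FPRRegBankID, 64) maps to AArch64::FADDDrr.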
798 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, 799 unsigned OpSize) { 800 switch (RegBankID) { 801 case AArch64::GPRRegBankID: 802 if (OpSize == 32) { 803 switch (GenericOpc) { 804 case TargetOpcode::G_SHL: 805 return AArch64::LSLVWr; 806 case TargetOpcode::G_LSHR: 807 return AArch64::LSRVWr; 808 case TargetOpcode::G_ASHR: 809 return AArch64::ASRVWr; 810 default: 811 return GenericOpc; 812 } 813 } else if (OpSize == 64) { 814 switch (GenericOpc) { 815 case TargetOpcode::G_PTR_ADD: 816 return AArch64::ADDXrr; 817 case TargetOpcode::G_SHL: 818 return AArch64::LSLVXr; 819 case TargetOpcode::G_LSHR: 820 return AArch64::LSRVXr; 821 case TargetOpcode::G_ASHR: 822 return AArch64::ASRVXr; 823 default: 824 return GenericOpc; 825 } 826 } 827 break; 828 case AArch64::FPRRegBankID: 829 switch (OpSize) { 830 case 32: 831 switch (GenericOpc) { 832 case TargetOpcode::G_FADD: 833 return AArch64::FADDSrr; 834 case TargetOpcode::G_FSUB: 835 return AArch64::FSUBSrr; 836 case TargetOpcode::G_FMUL: 837 return AArch64::FMULSrr; 838 case TargetOpcode::G_FDIV: 839 return AArch64::FDIVSrr; 840 default: 841 return GenericOpc; 842 } 843 case 64: 844 switch (GenericOpc) { 845 case TargetOpcode::G_FADD: 846 return AArch64::FADDDrr; 847 case TargetOpcode::G_FSUB: 848 return AArch64::FSUBDrr; 849 case TargetOpcode::G_FMUL: 850 return AArch64::FMULDrr; 851 case TargetOpcode::G_FDIV: 852 return AArch64::FDIVDrr; 853 case TargetOpcode::G_OR: 854 return AArch64::ORRv8i8; 855 default: 856 return GenericOpc; 857 } 858 } 859 break; 860 } 861 return GenericOpc; 862 } 863 864 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, 865 /// appropriate for the (value) register bank \p RegBankID and of memory access 866 /// size \p OpSize. This returns the variant with the base+unsigned-immediate 867 /// addressing mode (e.g., LDRXui). 868 /// \returns \p GenericOpc if the combination is unsupported. 869 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, 870 unsigned OpSize) { 871 const bool isStore = GenericOpc == TargetOpcode::G_STORE; 872 switch (RegBankID) { 873 case AArch64::GPRRegBankID: 874 switch (OpSize) { 875 case 8: 876 return isStore ? AArch64::STRBBui : AArch64::LDRBBui; 877 case 16: 878 return isStore ? AArch64::STRHHui : AArch64::LDRHHui; 879 case 32: 880 return isStore ? AArch64::STRWui : AArch64::LDRWui; 881 case 64: 882 return isStore ? AArch64::STRXui : AArch64::LDRXui; 883 } 884 break; 885 case AArch64::FPRRegBankID: 886 switch (OpSize) { 887 case 8: 888 return isStore ? AArch64::STRBui : AArch64::LDRBui; 889 case 16: 890 return isStore ? AArch64::STRHui : AArch64::LDRHui; 891 case 32: 892 return isStore ? AArch64::STRSui : AArch64::LDRSui; 893 case 64: 894 return isStore ? AArch64::STRDui : AArch64::LDRDui; 895 case 128: 896 return isStore ? AArch64::STRQui : AArch64::LDRQui; 897 } 898 break; 899 } 900 return GenericOpc; 901 } 902 903 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg 904 /// to \p *To. 
905 /// 906 /// E.g "To = COPY SrcReg:SubReg" 907 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI, 908 const RegisterBankInfo &RBI, Register SrcReg, 909 const TargetRegisterClass *To, unsigned SubReg) { 910 assert(SrcReg.isValid() && "Expected a valid source register?"); 911 assert(To && "Destination register class cannot be null"); 912 assert(SubReg && "Expected a valid subregister"); 913 914 MachineIRBuilder MIB(I); 915 auto SubRegCopy = 916 MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg); 917 MachineOperand &RegOp = I.getOperand(1); 918 RegOp.setReg(SubRegCopy.getReg(0)); 919 920 // It's possible that the destination register won't be constrained. Make 921 // sure that happens. 922 if (!I.getOperand(0).getReg().isPhysical()) 923 RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); 924 925 return true; 926 } 927 928 /// Helper function to get the source and destination register classes for a 929 /// copy. Returns a std::pair containing the source register class for the 930 /// copy, and the destination register class for the copy. If a register class 931 /// cannot be determined, then it will be nullptr. 932 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> 933 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, 934 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, 935 const RegisterBankInfo &RBI) { 936 Register DstReg = I.getOperand(0).getReg(); 937 Register SrcReg = I.getOperand(1).getReg(); 938 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); 939 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); 940 unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); 941 unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); 942 943 // Special casing for cross-bank copies of s1s. We can technically represent 944 // a 1-bit value with any size of register. The minimum size for a GPR is 32 945 // bits. So, we need to put the FPR on 32 bits as well. 946 // 947 // FIXME: I'm not sure if this case holds true outside of copies. If it does, 948 // then we can pull it into the helpers that get the appropriate class for a 949 // register bank. Or make a new helper that carries along some constraint 950 // information. 
951 if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) 952 SrcSize = DstSize = 32; 953 954 return {getMinClassForRegBank(SrcRegBank, SrcSize, true), 955 getMinClassForRegBank(DstRegBank, DstSize, true)}; 956 } 957 958 // FIXME: We need some sort of API in RBI/TRI to allow generic code to 959 // constrain operands of simple instructions given a TargetRegisterClass 960 // and LLT 961 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI, 962 const RegisterBankInfo &RBI) { 963 for (MachineOperand &MO : I.operands()) { 964 if (!MO.isReg()) 965 continue; 966 Register Reg = MO.getReg(); 967 if (!Reg) 968 continue; 969 if (Reg.isPhysical()) 970 continue; 971 LLT Ty = MRI.getType(Reg); 972 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 973 const TargetRegisterClass *RC = 974 RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 975 if (!RC) { 976 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 977 RC = getRegClassForTypeOnBank(Ty, RB); 978 if (!RC) { 979 LLVM_DEBUG( 980 dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n"); 981 break; 982 } 983 } 984 RBI.constrainGenericRegister(Reg, *RC, MRI); 985 } 986 987 return true; 988 } 989 990 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, 991 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, 992 const RegisterBankInfo &RBI) { 993 Register DstReg = I.getOperand(0).getReg(); 994 Register SrcReg = I.getOperand(1).getReg(); 995 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); 996 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); 997 998 // Find the correct register classes for the source and destination registers. 999 const TargetRegisterClass *SrcRC; 1000 const TargetRegisterClass *DstRC; 1001 std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); 1002 1003 if (!DstRC) { 1004 LLVM_DEBUG(dbgs() << "Unexpected dest size " 1005 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); 1006 return false; 1007 } 1008 1009 // Is this a copy? If so, then we may need to insert a subregister copy. 1010 if (I.isCopy()) { 1011 // Yes. Check if there's anything to fix up. 1012 if (!SrcRC) { 1013 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); 1014 return false; 1015 } 1016 1017 unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); 1018 unsigned DstSize = TRI.getRegSizeInBits(*DstRC); 1019 unsigned SubReg; 1020 1021 // If the source bank doesn't support a subregister copy small enough, 1022 // then we first need to copy to the destination bank. 1023 if (getMinSizeForRegBank(SrcRegBank) > DstSize) { 1024 const TargetRegisterClass *DstTempRC = 1025 getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); 1026 getSubRegForClass(DstRC, TRI, SubReg); 1027 1028 MachineIRBuilder MIB(I); 1029 auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); 1030 copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); 1031 } else if (SrcSize > DstSize) { 1032 // If the source register is bigger than the destination we need to 1033 // perform a subregister copy. 1034 const TargetRegisterClass *SubRegRC = 1035 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); 1036 getSubRegForClass(SubRegRC, TRI, SubReg); 1037 copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); 1038 } else if (DstSize > SrcSize) { 1039 // If the destination register is bigger than the source we need to do 1040 // a promotion using SUBREG_TO_REG. 
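      // E.g., widening a 32-bit GPR source into a 64-bit GPR destination
      // becomes, roughly:
      //   %promote:gpr64all = SUBREG_TO_REG 0, %src, %subreg.sub_32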
1041 const TargetRegisterClass *PromotionRC = 1042 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); 1043 getSubRegForClass(SrcRC, TRI, SubReg); 1044 1045 Register PromoteReg = MRI.createVirtualRegister(PromotionRC); 1046 BuildMI(*I.getParent(), I, I.getDebugLoc(), 1047 TII.get(AArch64::SUBREG_TO_REG), PromoteReg) 1048 .addImm(0) 1049 .addUse(SrcReg) 1050 .addImm(SubReg); 1051 MachineOperand &RegOp = I.getOperand(1); 1052 RegOp.setReg(PromoteReg); 1053 } 1054 1055 // If the destination is a physical register, then there's nothing to 1056 // change, so we're done. 1057 if (DstReg.isPhysical()) 1058 return true; 1059 } 1060 1061 // No need to constrain SrcReg. It will get constrained when we hit another 1062 // of its use or its defs. Copies do not have constraints. 1063 if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 1064 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) 1065 << " operand\n"); 1066 return false; 1067 } 1068 1069 // If this a GPR ZEXT that we want to just reduce down into a copy. 1070 // The sizes will be mismatched with the source < 32b but that's ok. 1071 if (I.getOpcode() == TargetOpcode::G_ZEXT) { 1072 I.setDesc(TII.get(AArch64::COPY)); 1073 assert(SrcRegBank.getID() == AArch64::GPRRegBankID); 1074 return selectCopy(I, TII, MRI, TRI, RBI); 1075 } 1076 1077 I.setDesc(TII.get(AArch64::COPY)); 1078 return true; 1079 } 1080 1081 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { 1082 if (!DstTy.isScalar() || !SrcTy.isScalar()) 1083 return GenericOpc; 1084 1085 const unsigned DstSize = DstTy.getSizeInBits(); 1086 const unsigned SrcSize = SrcTy.getSizeInBits(); 1087 1088 switch (DstSize) { 1089 case 32: 1090 switch (SrcSize) { 1091 case 32: 1092 switch (GenericOpc) { 1093 case TargetOpcode::G_SITOFP: 1094 return AArch64::SCVTFUWSri; 1095 case TargetOpcode::G_UITOFP: 1096 return AArch64::UCVTFUWSri; 1097 case TargetOpcode::G_FPTOSI: 1098 return AArch64::FCVTZSUWSr; 1099 case TargetOpcode::G_FPTOUI: 1100 return AArch64::FCVTZUUWSr; 1101 default: 1102 return GenericOpc; 1103 } 1104 case 64: 1105 switch (GenericOpc) { 1106 case TargetOpcode::G_SITOFP: 1107 return AArch64::SCVTFUXSri; 1108 case TargetOpcode::G_UITOFP: 1109 return AArch64::UCVTFUXSri; 1110 case TargetOpcode::G_FPTOSI: 1111 return AArch64::FCVTZSUWDr; 1112 case TargetOpcode::G_FPTOUI: 1113 return AArch64::FCVTZUUWDr; 1114 default: 1115 return GenericOpc; 1116 } 1117 default: 1118 return GenericOpc; 1119 } 1120 case 64: 1121 switch (SrcSize) { 1122 case 32: 1123 switch (GenericOpc) { 1124 case TargetOpcode::G_SITOFP: 1125 return AArch64::SCVTFUWDri; 1126 case TargetOpcode::G_UITOFP: 1127 return AArch64::UCVTFUWDri; 1128 case TargetOpcode::G_FPTOSI: 1129 return AArch64::FCVTZSUXSr; 1130 case TargetOpcode::G_FPTOUI: 1131 return AArch64::FCVTZUUXSr; 1132 default: 1133 return GenericOpc; 1134 } 1135 case 64: 1136 switch (GenericOpc) { 1137 case TargetOpcode::G_SITOFP: 1138 return AArch64::SCVTFUXDri; 1139 case TargetOpcode::G_UITOFP: 1140 return AArch64::UCVTFUXDri; 1141 case TargetOpcode::G_FPTOSI: 1142 return AArch64::FCVTZSUXDr; 1143 case TargetOpcode::G_FPTOUI: 1144 return AArch64::FCVTZUUXDr; 1145 default: 1146 return GenericOpc; 1147 } 1148 default: 1149 return GenericOpc; 1150 } 1151 default: 1152 return GenericOpc; 1153 }; 1154 return GenericOpc; 1155 } 1156 1157 MachineInstr * 1158 AArch64InstructionSelector::emitSelect(Register Dst, Register True, 1159 Register False, AArch64CC::CondCode CC, 1160 MachineIRBuilder &MIB) const { 1161 
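  // Emit a conditional select of True/False into Dst based on CC: FCSEL for
  // FPR operands, otherwise a CSEL, upgraded below to CSNEG/CSINV/CSINC when
  // a negated/inverted/incremented operand or a constant makes that possible.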
MachineRegisterInfo &MRI = *MIB.getMRI(); 1162 assert(RBI.getRegBank(False, MRI, TRI)->getID() == 1163 RBI.getRegBank(True, MRI, TRI)->getID() && 1164 "Expected both select operands to have the same regbank?"); 1165 LLT Ty = MRI.getType(True); 1166 if (Ty.isVector()) 1167 return nullptr; 1168 const unsigned Size = Ty.getSizeInBits(); 1169 assert((Size == 32 || Size == 64) && 1170 "Expected 32 bit or 64 bit select only?"); 1171 const bool Is32Bit = Size == 32; 1172 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { 1173 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; 1174 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); 1175 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); 1176 return &*FCSel; 1177 } 1178 1179 // By default, we'll try and emit a CSEL. 1180 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr; 1181 bool Optimized = false; 1182 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, 1183 &Optimized](Register &Reg, Register &OtherReg, 1184 bool Invert) { 1185 if (Optimized) 1186 return false; 1187 1188 // Attempt to fold: 1189 // 1190 // %sub = G_SUB 0, %x 1191 // %select = G_SELECT cc, %reg, %sub 1192 // 1193 // Into: 1194 // %select = CSNEG %reg, %x, cc 1195 Register MatchReg; 1196 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { 1197 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; 1198 Reg = MatchReg; 1199 if (Invert) { 1200 CC = AArch64CC::getInvertedCondCode(CC); 1201 std::swap(Reg, OtherReg); 1202 } 1203 return true; 1204 } 1205 1206 // Attempt to fold: 1207 // 1208 // %xor = G_XOR %x, -1 1209 // %select = G_SELECT cc, %reg, %xor 1210 // 1211 // Into: 1212 // %select = CSINV %reg, %x, cc 1213 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { 1214 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1215 Reg = MatchReg; 1216 if (Invert) { 1217 CC = AArch64CC::getInvertedCondCode(CC); 1218 std::swap(Reg, OtherReg); 1219 } 1220 return true; 1221 } 1222 1223 // Attempt to fold: 1224 // 1225 // %add = G_ADD %x, 1 1226 // %select = G_SELECT cc, %reg, %add 1227 // 1228 // Into: 1229 // %select = CSINC %reg, %x, cc 1230 if (mi_match(Reg, MRI, 1231 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)), 1232 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) { 1233 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1234 Reg = MatchReg; 1235 if (Invert) { 1236 CC = AArch64CC::getInvertedCondCode(CC); 1237 std::swap(Reg, OtherReg); 1238 } 1239 return true; 1240 } 1241 1242 return false; 1243 }; 1244 1245 // Helper lambda which tries to use CSINC/CSINV for the instruction when its 1246 // true/false values are constants. 1247 // FIXME: All of these patterns already exist in tablegen. We should be 1248 // able to import these. 1249 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, 1250 &Optimized]() { 1251 if (Optimized) 1252 return false; 1253 auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI); 1254 auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI); 1255 if (!TrueCst && !FalseCst) 1256 return false; 1257 1258 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; 1259 if (TrueCst && FalseCst) { 1260 int64_t T = TrueCst->Value.getSExtValue(); 1261 int64_t F = FalseCst->Value.getSExtValue(); 1262 1263 if (T == 0 && F == 1) { 1264 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc 1265 Opc = Is32Bit ? 
AArch64::CSINCWr : AArch64::CSINCXr;
1266 True = ZReg;
1267 False = ZReg;
1268 return true;
1269 }
1270
1271 if (T == 0 && F == -1) {
1272 // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1273 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1274 True = ZReg;
1275 False = ZReg;
1276 return true;
1277 }
1278 }
1279
1280 if (TrueCst) {
1281 int64_t T = TrueCst->Value.getSExtValue();
1282 if (T == 1) {
1283 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1284 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1285 True = False;
1286 False = ZReg;
1287 CC = AArch64CC::getInvertedCondCode(CC);
1288 return true;
1289 }
1290
1291 if (T == -1) {
1292 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1293 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1294 True = False;
1295 False = ZReg;
1296 CC = AArch64CC::getInvertedCondCode(CC);
1297 return true;
1298 }
1299 }
1300
1301 if (FalseCst) {
1302 int64_t F = FalseCst->Value.getSExtValue();
1303 if (F == 1) {
1304 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1305 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1306 False = ZReg;
1307 return true;
1308 }
1309
1310 if (F == -1) {
1311 // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1312 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1313 False = ZReg;
1314 return true;
1315 }
1316 }
1317 return false;
1318 };
1319
1320 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1321 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1322 Optimized |= TryOptSelectCst();
1323 auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1324 constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1325 return &*SelectInst;
1326 }
1327
1328 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1329 switch (P) {
1330 default:
1331 llvm_unreachable("Unknown condition code!");
1332 case CmpInst::ICMP_NE:
1333 return AArch64CC::NE;
1334 case CmpInst::ICMP_EQ:
1335 return AArch64CC::EQ;
1336 case CmpInst::ICMP_SGT:
1337 return AArch64CC::GT;
1338 case CmpInst::ICMP_SGE:
1339 return AArch64CC::GE;
1340 case CmpInst::ICMP_SLT:
1341 return AArch64CC::LT;
1342 case CmpInst::ICMP_SLE:
1343 return AArch64CC::LE;
1344 case CmpInst::ICMP_UGT:
1345 return AArch64CC::HI;
1346 case CmpInst::ICMP_UGE:
1347 return AArch64CC::HS;
1348 case CmpInst::ICMP_ULT:
1349 return AArch64CC::LO;
1350 case CmpInst::ICMP_ULE:
1351 return AArch64CC::LS;
1352 }
1353 }
1354
1355 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
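/// Some predicates need two conditions that the caller effectively ORs by
/// branching on both; e.g. FCMP_UEQ maps to EQ with a second condition of VS
/// (unordered), and FCMP_ONE maps to MI and GT.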
1356 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC, 1357 AArch64CC::CondCode &CondCode, 1358 AArch64CC::CondCode &CondCode2) { 1359 CondCode2 = AArch64CC::AL; 1360 switch (CC) { 1361 default: 1362 llvm_unreachable("Unknown FP condition!"); 1363 case CmpInst::FCMP_OEQ: 1364 CondCode = AArch64CC::EQ; 1365 break; 1366 case CmpInst::FCMP_OGT: 1367 CondCode = AArch64CC::GT; 1368 break; 1369 case CmpInst::FCMP_OGE: 1370 CondCode = AArch64CC::GE; 1371 break; 1372 case CmpInst::FCMP_OLT: 1373 CondCode = AArch64CC::MI; 1374 break; 1375 case CmpInst::FCMP_OLE: 1376 CondCode = AArch64CC::LS; 1377 break; 1378 case CmpInst::FCMP_ONE: 1379 CondCode = AArch64CC::MI; 1380 CondCode2 = AArch64CC::GT; 1381 break; 1382 case CmpInst::FCMP_ORD: 1383 CondCode = AArch64CC::VC; 1384 break; 1385 case CmpInst::FCMP_UNO: 1386 CondCode = AArch64CC::VS; 1387 break; 1388 case CmpInst::FCMP_UEQ: 1389 CondCode = AArch64CC::EQ; 1390 CondCode2 = AArch64CC::VS; 1391 break; 1392 case CmpInst::FCMP_UGT: 1393 CondCode = AArch64CC::HI; 1394 break; 1395 case CmpInst::FCMP_UGE: 1396 CondCode = AArch64CC::PL; 1397 break; 1398 case CmpInst::FCMP_ULT: 1399 CondCode = AArch64CC::LT; 1400 break; 1401 case CmpInst::FCMP_ULE: 1402 CondCode = AArch64CC::LE; 1403 break; 1404 case CmpInst::FCMP_UNE: 1405 CondCode = AArch64CC::NE; 1406 break; 1407 } 1408 } 1409 1410 /// Convert an IR fp condition code to an AArch64 CC. 1411 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 1412 /// should be AND'ed instead of OR'ed. 1413 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, 1414 AArch64CC::CondCode &CondCode, 1415 AArch64CC::CondCode &CondCode2) { 1416 CondCode2 = AArch64CC::AL; 1417 switch (CC) { 1418 default: 1419 changeFPCCToORAArch64CC(CC, CondCode, CondCode2); 1420 assert(CondCode2 == AArch64CC::AL); 1421 break; 1422 case CmpInst::FCMP_ONE: 1423 // (a one b) 1424 // == ((a olt b) || (a ogt b)) 1425 // == ((a ord b) && (a une b)) 1426 CondCode = AArch64CC::VC; 1427 CondCode2 = AArch64CC::NE; 1428 break; 1429 case CmpInst::FCMP_UEQ: 1430 // (a ueq b) 1431 // == ((a uno b) || (a oeq b)) 1432 // == ((a ule b) && (a uge b)) 1433 CondCode = AArch64CC::PL; 1434 CondCode2 = AArch64CC::LE; 1435 break; 1436 } 1437 } 1438 1439 /// Return a register which can be used as a bit to test in a TB(N)Z. 1440 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, 1441 MachineRegisterInfo &MRI) { 1442 assert(Reg.isValid() && "Expected valid register!"); 1443 bool HasZext = false; 1444 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { 1445 unsigned Opc = MI->getOpcode(); 1446 1447 if (!MI->getOperand(0).isReg() || 1448 !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 1449 break; 1450 1451 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 1452 // 1453 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number 1454 // on the truncated x is the same as the bit number on x. 1455 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || 1456 Opc == TargetOpcode::G_TRUNC) { 1457 if (Opc == TargetOpcode::G_ZEXT) 1458 HasZext = true; 1459 1460 Register NextReg = MI->getOperand(1).getReg(); 1461 // Did we find something worth folding? 1462 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) 1463 break; 1464 1465 // NextReg is worth folding. Keep looking. 1466 Reg = NextReg; 1467 continue; 1468 } 1469 1470 // Attempt to find a suitable operation with a constant on one side. 
1471 std::optional<uint64_t> C; 1472 Register TestReg; 1473 switch (Opc) { 1474 default: 1475 break; 1476 case TargetOpcode::G_AND: 1477 case TargetOpcode::G_XOR: { 1478 TestReg = MI->getOperand(1).getReg(); 1479 Register ConstantReg = MI->getOperand(2).getReg(); 1480 auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1481 if (!VRegAndVal) { 1482 // AND commutes, check the other side for a constant. 1483 // FIXME: Can we canonicalize the constant so that it's always on the 1484 // same side at some point earlier? 1485 std::swap(ConstantReg, TestReg); 1486 VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1487 } 1488 if (VRegAndVal) { 1489 if (HasZext) 1490 C = VRegAndVal->Value.getZExtValue(); 1491 else 1492 C = VRegAndVal->Value.getSExtValue(); 1493 } 1494 break; 1495 } 1496 case TargetOpcode::G_ASHR: 1497 case TargetOpcode::G_LSHR: 1498 case TargetOpcode::G_SHL: { 1499 TestReg = MI->getOperand(1).getReg(); 1500 auto VRegAndVal = 1501 getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); 1502 if (VRegAndVal) 1503 C = VRegAndVal->Value.getSExtValue(); 1504 break; 1505 } 1506 } 1507 1508 // Didn't find a constant or viable register. Bail out of the loop. 1509 if (!C || !TestReg.isValid()) 1510 break; 1511 1512 // We found a suitable instruction with a constant. Check to see if we can 1513 // walk through the instruction. 1514 Register NextReg; 1515 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); 1516 switch (Opc) { 1517 default: 1518 break; 1519 case TargetOpcode::G_AND: 1520 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. 1521 if ((*C >> Bit) & 1) 1522 NextReg = TestReg; 1523 break; 1524 case TargetOpcode::G_SHL: 1525 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in 1526 // the type of the register. 1527 if (*C <= Bit && (Bit - *C) < TestRegSize) { 1528 NextReg = TestReg; 1529 Bit = Bit - *C; 1530 } 1531 break; 1532 case TargetOpcode::G_ASHR: 1533 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits 1534 // in x 1535 NextReg = TestReg; 1536 Bit = Bit + *C; 1537 if (Bit >= TestRegSize) 1538 Bit = TestRegSize - 1; 1539 break; 1540 case TargetOpcode::G_LSHR: 1541 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x 1542 if ((Bit + *C) < TestRegSize) { 1543 NextReg = TestReg; 1544 Bit = Bit + *C; 1545 } 1546 break; 1547 case TargetOpcode::G_XOR: 1548 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when 1549 // appropriate. 1550 // 1551 // e.g. If x' = xor x, c, and the b-th bit is set in c then 1552 // 1553 // tbz x', b -> tbnz x, b 1554 // 1555 // Because x' only has the b-th bit set if x does not. 1556 if ((*C >> Bit) & 1) 1557 Invert = !Invert; 1558 NextReg = TestReg; 1559 break; 1560 } 1561 1562 // Check if we found anything worth folding. 1563 if (!NextReg.isValid()) 1564 return Reg; 1565 Reg = NextReg; 1566 } 1567 1568 return Reg; 1569 } 1570 1571 MachineInstr *AArch64InstructionSelector::emitTestBit( 1572 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, 1573 MachineIRBuilder &MIB) const { 1574 assert(TestReg.isValid()); 1575 assert(ProduceNonFlagSettingCondBr && 1576 "Cannot emit TB(N)Z with speculation tracking!"); 1577 MachineRegisterInfo &MRI = *MIB.getMRI(); 1578 1579 // Attempt to optimize the test bit by walking over instructions. 
1580 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); 1581 LLT Ty = MRI.getType(TestReg); 1582 unsigned Size = Ty.getSizeInBits(); 1583 assert(!Ty.isVector() && "Expected a scalar!"); 1584 assert(Bit < 64 && "Bit is too large!"); 1585 1586 // When the test register is a 64-bit register, we have to narrow to make 1587 // TBNZW work. 1588 bool UseWReg = Bit < 32; 1589 unsigned NecessarySize = UseWReg ? 32 : 64; 1590 if (Size != NecessarySize) 1591 TestReg = moveScalarRegClass( 1592 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, 1593 MIB); 1594 1595 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, 1596 {AArch64::TBZW, AArch64::TBNZW}}; 1597 unsigned Opc = OpcTable[UseWReg][IsNegative]; 1598 auto TestBitMI = 1599 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); 1600 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); 1601 return &*TestBitMI; 1602 } 1603 1604 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( 1605 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, 1606 MachineIRBuilder &MIB) const { 1607 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); 1608 // Given something like this: 1609 // 1610 // %x = ...Something... 1611 // %one = G_CONSTANT i64 1 1612 // %zero = G_CONSTANT i64 0 1613 // %and = G_AND %x, %one 1614 // %cmp = G_ICMP intpred(ne), %and, %zero 1615 // %cmp_trunc = G_TRUNC %cmp 1616 // G_BRCOND %cmp_trunc, %bb.3 1617 // 1618 // We want to try and fold the AND into the G_BRCOND and produce either a 1619 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 1620 // 1621 // In this case, we'd get 1622 // 1623 // TBNZ %x %bb.3 1624 // 1625 1626 // Check if the AND has a constant on its RHS which we can use as a mask. 1627 // If it's a power of 2, then it's the same as checking a specific bit. 1628 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) 1629 auto MaybeBit = getIConstantVRegValWithLookThrough( 1630 AndInst.getOperand(2).getReg(), *MIB.getMRI()); 1631 if (!MaybeBit) 1632 return false; 1633 1634 int32_t Bit = MaybeBit->Value.exactLogBase2(); 1635 if (Bit < 0) 1636 return false; 1637 1638 Register TestReg = AndInst.getOperand(1).getReg(); 1639 1640 // Emit a TB(N)Z. 
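  // Invert == false gives a TBZ (branch if the bit is clear); Invert == true
  // gives a TBNZ, which is what the intpred(ne) example above ends up as.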
1641 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1642 return true; 1643 } 1644 1645 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1646 bool IsNegative, 1647 MachineBasicBlock *DestMBB, 1648 MachineIRBuilder &MIB) const { 1649 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1650 MachineRegisterInfo &MRI = *MIB.getMRI(); 1651 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1652 AArch64::GPRRegBankID && 1653 "Expected GPRs only?"); 1654 auto Ty = MRI.getType(CompareReg); 1655 unsigned Width = Ty.getSizeInBits(); 1656 assert(!Ty.isVector() && "Expected scalar only?"); 1657 assert(Width <= 64 && "Expected width to be at most 64?"); 1658 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1659 {AArch64::CBNZW, AArch64::CBNZX}}; 1660 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1661 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1662 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1663 return &*BranchMI; 1664 } 1665 1666 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1667 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1668 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1669 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1670 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1671 // totally clean. Some of them require two branches to implement. 1672 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1673 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1674 Pred); 1675 AArch64CC::CondCode CC1, CC2; 1676 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1677 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1678 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1679 if (CC2 != AArch64CC::AL) 1680 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1681 I.eraseFromParent(); 1682 return true; 1683 } 1684 1685 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1686 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1687 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1688 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1689 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1690 // 1691 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1692 // instructions will not be produced, as they are conditional branch 1693 // instructions that do not set flags. 1694 if (!ProduceNonFlagSettingCondBr) 1695 return false; 1696 1697 MachineRegisterInfo &MRI = *MIB.getMRI(); 1698 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1699 auto Pred = 1700 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1701 Register LHS = ICmp.getOperand(2).getReg(); 1702 Register RHS = ICmp.getOperand(3).getReg(); 1703 1704 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1705 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1706 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1707 1708 // When we can emit a TB(N)Z, prefer that. 1709 // 1710 // Handle non-commutative condition codes first. 1711 // Note that we don't want to do this when we have a G_AND because it can 1712 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1713 if (VRegAndVal && !AndInst) { 1714 int64_t C = VRegAndVal->Value.getSExtValue(); 1715 1716 // When we have a greater-than comparison, we can just test if the msb is 1717 // zero. 
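    // (x s> -1) is the same as (x s>= 0), i.e. the sign bit is clear. So,
    // e.g. for an s64 %x, "G_ICMP intpred(sgt), %x, -1" feeding the branch
    // becomes a TBZ on bit 63 of %x.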
1718 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1719 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1720 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1721 I.eraseFromParent(); 1722 return true; 1723 } 1724 1725 // When we have a less than comparison, we can just test if the msb is not 1726 // zero. 1727 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1728 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1729 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1730 I.eraseFromParent(); 1731 return true; 1732 } 1733 1734 // Inversely, if we have a signed greater-than-or-equal comparison to zero, 1735 // we can test if the msb is zero. 1736 if (C == 0 && Pred == CmpInst::ICMP_SGE) { 1737 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1738 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1739 I.eraseFromParent(); 1740 return true; 1741 } 1742 } 1743 1744 // Attempt to handle commutative condition codes. Right now, that's only 1745 // eq/ne. 1746 if (ICmpInst::isEquality(Pred)) { 1747 if (!VRegAndVal) { 1748 std::swap(RHS, LHS); 1749 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1750 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1751 } 1752 1753 if (VRegAndVal && VRegAndVal->Value == 0) { 1754 // If there's a G_AND feeding into this branch, try to fold it away by 1755 // emitting a TB(N)Z instead. 1756 // 1757 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1758 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1759 // would be redundant. 1760 if (AndInst && 1761 tryOptAndIntoCompareBranch( 1762 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1763 I.eraseFromParent(); 1764 return true; 1765 } 1766 1767 // Otherwise, try to emit a CB(N)Z instead. 1768 auto LHSTy = MRI.getType(LHS); 1769 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1770 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1771 I.eraseFromParent(); 1772 return true; 1773 } 1774 } 1775 } 1776 1777 return false; 1778 } 1779 1780 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1781 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1782 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1783 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1784 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1785 return true; 1786 1787 // Couldn't optimize. Emit a compare + a Bcc. 1788 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1789 auto PredOp = ICmp.getOperand(1); 1790 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1791 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1792 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1793 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1794 I.eraseFromParent(); 1795 return true; 1796 } 1797 1798 bool AArch64InstructionSelector::selectCompareBranch( 1799 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1800 Register CondReg = I.getOperand(0).getReg(); 1801 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1802 // Try to select the G_BRCOND using whatever is feeding the condition if 1803 // possible. 
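  // A G_FCMP or G_ICMP feeding the condition gets its own lowering below.
  // Otherwise the condition is just a value: branch on bit 0 with a TBNZ when
  // flag-free conditional branches are allowed, or fall back to the
  // tst + b.ne sequence at the end.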
1804 unsigned CCMIOpc = CCMI->getOpcode(); 1805 if (CCMIOpc == TargetOpcode::G_FCMP) 1806 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1807 if (CCMIOpc == TargetOpcode::G_ICMP) 1808 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1809 1810 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1811 // instructions will not be produced, as they are conditional branch 1812 // instructions that do not set flags. 1813 if (ProduceNonFlagSettingCondBr) { 1814 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1815 I.getOperand(1).getMBB(), MIB); 1816 I.eraseFromParent(); 1817 return true; 1818 } 1819 1820 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1821 auto TstMI = 1822 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1823 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1824 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1825 .addImm(AArch64CC::NE) 1826 .addMBB(I.getOperand(1).getMBB()); 1827 I.eraseFromParent(); 1828 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1829 } 1830 1831 /// Returns the element immediate value of a vector shift operand if found. 1832 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1833 static std::optional<int64_t> getVectorShiftImm(Register Reg, 1834 MachineRegisterInfo &MRI) { 1835 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1836 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1837 return getAArch64VectorSplatScalar(*OpMI, MRI); 1838 } 1839 1840 /// Matches and returns the shift immediate value for a SHL instruction given 1841 /// a shift operand. 1842 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, 1843 MachineRegisterInfo &MRI) { 1844 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1845 if (!ShiftImm) 1846 return std::nullopt; 1847 // Check the immediate is in range for a SHL. 1848 int64_t Imm = *ShiftImm; 1849 if (Imm < 0) 1850 return std::nullopt; 1851 switch (SrcTy.getElementType().getSizeInBits()) { 1852 default: 1853 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1854 return std::nullopt; 1855 case 8: 1856 if (Imm > 7) 1857 return std::nullopt; 1858 break; 1859 case 16: 1860 if (Imm > 15) 1861 return std::nullopt; 1862 break; 1863 case 32: 1864 if (Imm > 31) 1865 return std::nullopt; 1866 break; 1867 case 64: 1868 if (Imm > 63) 1869 return std::nullopt; 1870 break; 1871 } 1872 return Imm; 1873 } 1874 1875 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1876 MachineRegisterInfo &MRI) { 1877 assert(I.getOpcode() == TargetOpcode::G_SHL); 1878 Register DstReg = I.getOperand(0).getReg(); 1879 const LLT Ty = MRI.getType(DstReg); 1880 Register Src1Reg = I.getOperand(1).getReg(); 1881 Register Src2Reg = I.getOperand(2).getReg(); 1882 1883 if (!Ty.isVector()) 1884 return false; 1885 1886 // Check if we have a vector of constants on RHS that we can select as the 1887 // immediate form. 1888 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1889 1890 unsigned Opc = 0; 1891 if (Ty == LLT::fixed_vector(2, 64)) { 1892 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1893 } else if (Ty == LLT::fixed_vector(4, 32)) { 1894 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1895 } else if (Ty == LLT::fixed_vector(2, 32)) { 1896 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1897 } else if (Ty == LLT::fixed_vector(4, 16)) { 1898 Opc = ImmVal ? 
AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1899 } else if (Ty == LLT::fixed_vector(8, 16)) { 1900 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1901 } else if (Ty == LLT::fixed_vector(16, 8)) { 1902 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1903 } else if (Ty == LLT::fixed_vector(8, 8)) { 1904 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1905 } else { 1906 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1907 return false; 1908 } 1909 1910 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1911 if (ImmVal) 1912 Shl.addImm(*ImmVal); 1913 else 1914 Shl.addUse(Src2Reg); 1915 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1916 I.eraseFromParent(); 1917 return true; 1918 } 1919 1920 bool AArch64InstructionSelector::selectVectorAshrLshr( 1921 MachineInstr &I, MachineRegisterInfo &MRI) { 1922 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1923 I.getOpcode() == TargetOpcode::G_LSHR); 1924 Register DstReg = I.getOperand(0).getReg(); 1925 const LLT Ty = MRI.getType(DstReg); 1926 Register Src1Reg = I.getOperand(1).getReg(); 1927 Register Src2Reg = I.getOperand(2).getReg(); 1928 1929 if (!Ty.isVector()) 1930 return false; 1931 1932 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1933 1934 // We expect the immediate case to be lowered in the PostLegalCombiner to 1935 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1936 1937 // There is not a shift right register instruction, but the shift left 1938 // register instruction takes a signed value, where negative numbers specify a 1939 // right shift. 1940 1941 unsigned Opc = 0; 1942 unsigned NegOpc = 0; 1943 const TargetRegisterClass *RC = 1944 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); 1945 if (Ty == LLT::fixed_vector(2, 64)) { 1946 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1947 NegOpc = AArch64::NEGv2i64; 1948 } else if (Ty == LLT::fixed_vector(4, 32)) { 1949 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1950 NegOpc = AArch64::NEGv4i32; 1951 } else if (Ty == LLT::fixed_vector(2, 32)) { 1952 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1953 NegOpc = AArch64::NEGv2i32; 1954 } else if (Ty == LLT::fixed_vector(4, 16)) { 1955 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1956 NegOpc = AArch64::NEGv4i16; 1957 } else if (Ty == LLT::fixed_vector(8, 16)) { 1958 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1959 NegOpc = AArch64::NEGv8i16; 1960 } else if (Ty == LLT::fixed_vector(16, 8)) { 1961 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1962 NegOpc = AArch64::NEGv16i8; 1963 } else if (Ty == LLT::fixed_vector(8, 8)) { 1964 Opc = IsASHR ? 
AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1965 NegOpc = AArch64::NEGv8i8; 1966 } else { 1967 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1968 return false; 1969 } 1970 1971 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1972 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1973 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1974 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1975 I.eraseFromParent(); 1976 return true; 1977 } 1978 1979 bool AArch64InstructionSelector::selectVaStartAAPCS( 1980 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1981 return false; 1982 } 1983 1984 bool AArch64InstructionSelector::selectVaStartDarwin( 1985 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 1986 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 1987 Register ListReg = I.getOperand(0).getReg(); 1988 1989 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 1990 1991 int FrameIdx = FuncInfo->getVarArgsStackIndex(); 1992 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64( 1993 MF.getFunction().getCallingConv())) { 1994 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0 1995 ? FuncInfo->getVarArgsGPRIndex() 1996 : FuncInfo->getVarArgsStackIndex(); 1997 } 1998 1999 auto MIB = 2000 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 2001 .addDef(ArgsAddrReg) 2002 .addFrameIndex(FrameIdx) 2003 .addImm(0) 2004 .addImm(0); 2005 2006 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2007 2008 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 2009 .addUse(ArgsAddrReg) 2010 .addUse(ListReg) 2011 .addImm(0) 2012 .addMemOperand(*I.memoperands_begin()); 2013 2014 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2015 I.eraseFromParent(); 2016 return true; 2017 } 2018 2019 void AArch64InstructionSelector::materializeLargeCMVal( 2020 MachineInstr &I, const Value *V, unsigned OpFlags) { 2021 MachineBasicBlock &MBB = *I.getParent(); 2022 MachineFunction &MF = *MBB.getParent(); 2023 MachineRegisterInfo &MRI = MF.getRegInfo(); 2024 2025 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 2026 MovZ->addOperand(MF, I.getOperand(1)); 2027 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 2028 AArch64II::MO_NC); 2029 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 2030 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 2031 2032 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 2033 Register ForceDstReg) { 2034 Register DstReg = ForceDstReg 2035 ? 
ForceDstReg 2036 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 2037 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 2038 if (auto *GV = dyn_cast<GlobalValue>(V)) { 2039 MovI->addOperand(MF, MachineOperand::CreateGA( 2040 GV, MovZ->getOperand(1).getOffset(), Flags)); 2041 } else { 2042 MovI->addOperand( 2043 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 2044 MovZ->getOperand(1).getOffset(), Flags)); 2045 } 2046 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 2047 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 2048 return DstReg; 2049 }; 2050 Register DstReg = BuildMovK(MovZ.getReg(0), 2051 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 2052 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 2053 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 2054 } 2055 2056 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 2057 MachineBasicBlock &MBB = *I.getParent(); 2058 MachineFunction &MF = *MBB.getParent(); 2059 MachineRegisterInfo &MRI = MF.getRegInfo(); 2060 2061 switch (I.getOpcode()) { 2062 case TargetOpcode::G_STORE: { 2063 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 2064 MachineOperand &SrcOp = I.getOperand(0); 2065 if (MRI.getType(SrcOp.getReg()).isPointer()) { 2066 // Allow matching with imported patterns for stores of pointers. Unlike 2067 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 2068 // and constrain. 2069 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 2070 Register NewSrc = Copy.getReg(0); 2071 SrcOp.setReg(NewSrc); 2072 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 2073 Changed = true; 2074 } 2075 return Changed; 2076 } 2077 case TargetOpcode::G_PTR_ADD: 2078 return convertPtrAddToAdd(I, MRI); 2079 case TargetOpcode::G_LOAD: { 2080 // For scalar loads of pointers, we try to convert the dest type from p0 2081 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD 2082 // conversion, this should be ok because all users should have been 2083 // selected already, so the type doesn't matter for them. 2084 Register DstReg = I.getOperand(0).getReg(); 2085 const LLT DstTy = MRI.getType(DstReg); 2086 if (!DstTy.isPointer()) 2087 return false; 2088 MRI.setType(DstReg, LLT::scalar(64)); 2089 return true; 2090 } 2091 case AArch64::G_DUP: { 2092 // Convert the type from p0 to s64 to help selection. 2093 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2094 if (!DstTy.getElementType().isPointer()) 2095 return false; 2096 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 2097 MRI.setType(I.getOperand(0).getReg(), 2098 DstTy.changeElementType(LLT::scalar(64))); 2099 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2100 I.getOperand(1).setReg(NewSrc.getReg(0)); 2101 return true; 2102 } 2103 case TargetOpcode::G_UITOFP: 2104 case TargetOpcode::G_SITOFP: { 2105 // If both source and destination regbanks are FPR, then convert the opcode 2106 // to G_SITOF so that the importer can select it to an fpr variant. 2107 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 2108 // copy. 
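    // AArch64::G_SITOF / G_UITOF are the target's generic opcodes for the
    // all-FPR conversion, which should select to the scvtf/ucvtf forms that
    // read their source from an FP register (e.g. scvtf s0, s0).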
2109 Register SrcReg = I.getOperand(1).getReg(); 2110 LLT SrcTy = MRI.getType(SrcReg); 2111 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2112 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2113 return false; 2114 2115 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2116 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2117 I.setDesc(TII.get(AArch64::G_SITOF)); 2118 else 2119 I.setDesc(TII.get(AArch64::G_UITOF)); 2120 return true; 2121 } 2122 return false; 2123 } 2124 default: 2125 return false; 2126 } 2127 } 2128 2129 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2130 /// them to a standard G_ADD with a COPY on the source. 2131 /// 2132 /// The motivation behind this is to expose the add semantics to the imported 2133 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2134 /// because the selector works bottom up, uses before defs. By the time we 2135 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2136 /// fold this into addressing modes and were therefore unsuccessful. 2137 bool AArch64InstructionSelector::convertPtrAddToAdd( 2138 MachineInstr &I, MachineRegisterInfo &MRI) { 2139 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2140 Register DstReg = I.getOperand(0).getReg(); 2141 Register AddOp1Reg = I.getOperand(1).getReg(); 2142 const LLT PtrTy = MRI.getType(DstReg); 2143 if (PtrTy.getAddressSpace() != 0) 2144 return false; 2145 2146 const LLT CastPtrTy = 2147 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2148 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2149 // Set regbanks on the registers. 2150 if (PtrTy.isVector()) 2151 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2152 else 2153 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2154 2155 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2156 // %dst(intty) = G_ADD %intbase, off 2157 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2158 MRI.setType(DstReg, CastPtrTy); 2159 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2160 if (!select(*PtrToInt)) { 2161 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2162 return false; 2163 } 2164 2165 // Also take the opportunity here to try to do some optimization. 2166 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 2167 Register NegatedReg; 2168 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2169 return true; 2170 I.getOperand(2).setReg(NegatedReg); 2171 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2172 return true; 2173 } 2174 2175 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2176 MachineRegisterInfo &MRI) { 2177 // We try to match the immediate variant of LSL, which is actually an alias 2178 // for a special case of UBFM. Otherwise, we fall back to the imported 2179 // selector which will match the register variant. 2180 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2181 const auto &MO = I.getOperand(2); 2182 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2183 if (!VRegAndVal) 2184 return false; 2185 2186 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2187 if (DstTy.isVector()) 2188 return false; 2189 bool Is64Bit = DstTy.getSizeInBits() == 64; 2190 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2191 auto Imm2Fn = Is64Bit ? 
selectShiftB_64(MO) : selectShiftB_32(MO); 2192 2193 if (!Imm1Fn || !Imm2Fn) 2194 return false; 2195 2196 auto NewI = 2197 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2198 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2199 2200 for (auto &RenderFn : *Imm1Fn) 2201 RenderFn(NewI); 2202 for (auto &RenderFn : *Imm2Fn) 2203 RenderFn(NewI); 2204 2205 I.eraseFromParent(); 2206 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2207 } 2208 2209 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2210 MachineInstr &I, MachineRegisterInfo &MRI) { 2211 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2212 // If we're storing a scalar, it doesn't matter what register bank that 2213 // scalar is on. All that matters is the size. 2214 // 2215 // So, if we see something like this (with a 32-bit scalar as an example): 2216 // 2217 // %x:gpr(s32) = ... something ... 2218 // %y:fpr(s32) = COPY %x:gpr(s32) 2219 // G_STORE %y:fpr(s32) 2220 // 2221 // We can fix this up into something like this: 2222 // 2223 // G_STORE %x:gpr(s32) 2224 // 2225 // And then continue the selection process normally. 2226 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2227 if (!DefDstReg.isValid()) 2228 return false; 2229 LLT DefDstTy = MRI.getType(DefDstReg); 2230 Register StoreSrcReg = I.getOperand(0).getReg(); 2231 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2232 2233 // If we get something strange like a physical register, then we shouldn't 2234 // go any further. 2235 if (!DefDstTy.isValid()) 2236 return false; 2237 2238 // Are the source and dst types the same size? 2239 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2240 return false; 2241 2242 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2243 RBI.getRegBank(DefDstReg, MRI, TRI)) 2244 return false; 2245 2246 // We have a cross-bank copy, which is entering a store. Let's fold it. 2247 I.getOperand(0).setReg(DefDstReg); 2248 return true; 2249 } 2250 2251 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2252 assert(I.getParent() && "Instruction should be in a basic block!"); 2253 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2254 2255 MachineBasicBlock &MBB = *I.getParent(); 2256 MachineFunction &MF = *MBB.getParent(); 2257 MachineRegisterInfo &MRI = MF.getRegInfo(); 2258 2259 switch (I.getOpcode()) { 2260 case AArch64::G_DUP: { 2261 // Before selecting a DUP instruction, check if it is better selected as a 2262 // MOV or load from a constant pool. 2263 Register Src = I.getOperand(1).getReg(); 2264 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI); 2265 if (!ValAndVReg) 2266 return false; 2267 LLVMContext &Ctx = MF.getFunction().getContext(); 2268 Register Dst = I.getOperand(0).getReg(); 2269 auto *CV = ConstantDataVector::getSplat( 2270 MRI.getType(Dst).getNumElements(), 2271 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), 2272 ValAndVReg->Value)); 2273 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2274 return false; 2275 I.eraseFromParent(); 2276 return true; 2277 } 2278 case TargetOpcode::G_SEXT: 2279 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2280 // over a normal extend. 
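    // SMOV moves a vector lane into a GPR and sign-extends it in one
    // instruction, so it beats an element extract followed by a separate
    // sign extend.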
2281 if (selectUSMovFromExtend(I, MRI)) 2282 return true; 2283 return false; 2284 case TargetOpcode::G_BR: 2285 return false; 2286 case TargetOpcode::G_SHL: 2287 return earlySelectSHL(I, MRI); 2288 case TargetOpcode::G_CONSTANT: { 2289 bool IsZero = false; 2290 if (I.getOperand(1).isCImm()) 2291 IsZero = I.getOperand(1).getCImm()->isZero(); 2292 else if (I.getOperand(1).isImm()) 2293 IsZero = I.getOperand(1).getImm() == 0; 2294 2295 if (!IsZero) 2296 return false; 2297 2298 Register DefReg = I.getOperand(0).getReg(); 2299 LLT Ty = MRI.getType(DefReg); 2300 if (Ty.getSizeInBits() == 64) { 2301 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2302 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2303 } else if (Ty.getSizeInBits() == 32) { 2304 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2305 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2306 } else 2307 return false; 2308 2309 I.setDesc(TII.get(TargetOpcode::COPY)); 2310 return true; 2311 } 2312 2313 case TargetOpcode::G_ADD: { 2314 // Check if this is being fed by a G_ICMP on either side. 2315 // 2316 // (cmp pred, x, y) + z 2317 // 2318 // In the above case, when the cmp is true, we increment z by 1. So, we can 2319 // fold the add into the cset for the cmp by using cinc. 2320 // 2321 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2322 Register AddDst = I.getOperand(0).getReg(); 2323 Register AddLHS = I.getOperand(1).getReg(); 2324 Register AddRHS = I.getOperand(2).getReg(); 2325 // Only handle scalars. 2326 LLT Ty = MRI.getType(AddLHS); 2327 if (Ty.isVector()) 2328 return false; 2329 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2330 // bits. 2331 unsigned Size = Ty.getSizeInBits(); 2332 if (Size != 32 && Size != 64) 2333 return false; 2334 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2335 if (!MRI.hasOneNonDBGUse(Reg)) 2336 return nullptr; 2337 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2338 // compare. 2339 if (Size == 32) 2340 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2341 // We model scalar compares using 32-bit destinations right now. 2342 // If it's a 64-bit compare, it'll have 64-bit sources. 
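      // So in the 64-bit case the add operand is a G_ZEXT of the s32 compare
      // result; look through that zext, then check that the compare really
      // has 64-bit sources.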
2343 Register ZExt; 2344 if (!mi_match(Reg, MRI, 2345 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2346 return nullptr; 2347 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2348 if (!Cmp || 2349 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2350 return nullptr; 2351 return Cmp; 2352 }; 2353 // Try to match 2354 // z + (cmp pred, x, y) 2355 MachineInstr *Cmp = MatchCmp(AddRHS); 2356 if (!Cmp) { 2357 // (cmp pred, x, y) + z 2358 std::swap(AddLHS, AddRHS); 2359 Cmp = MatchCmp(AddRHS); 2360 if (!Cmp) 2361 return false; 2362 } 2363 auto &PredOp = Cmp->getOperand(1); 2364 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2365 const AArch64CC::CondCode InvCC = 2366 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2367 MIB.setInstrAndDebugLoc(I); 2368 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2369 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2370 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2371 I.eraseFromParent(); 2372 return true; 2373 } 2374 case TargetOpcode::G_OR: { 2375 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2376 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2377 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2378 Register Dst = I.getOperand(0).getReg(); 2379 LLT Ty = MRI.getType(Dst); 2380 2381 if (!Ty.isScalar()) 2382 return false; 2383 2384 unsigned Size = Ty.getSizeInBits(); 2385 if (Size != 32 && Size != 64) 2386 return false; 2387 2388 Register ShiftSrc; 2389 int64_t ShiftImm; 2390 Register MaskSrc; 2391 int64_t MaskImm; 2392 if (!mi_match( 2393 Dst, MRI, 2394 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2395 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2396 return false; 2397 2398 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2399 return false; 2400 2401 int64_t Immr = Size - ShiftImm; 2402 int64_t Imms = Size - ShiftImm - 1; 2403 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2404 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2405 I.eraseFromParent(); 2406 return true; 2407 } 2408 case TargetOpcode::G_FENCE: { 2409 if (I.getOperand(1).getImm() == 0) 2410 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER)); 2411 else 2412 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB)) 2413 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); 2414 I.eraseFromParent(); 2415 return true; 2416 } 2417 default: 2418 return false; 2419 } 2420 } 2421 2422 bool AArch64InstructionSelector::select(MachineInstr &I) { 2423 assert(I.getParent() && "Instruction should be in a basic block!"); 2424 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2425 2426 MachineBasicBlock &MBB = *I.getParent(); 2427 MachineFunction &MF = *MBB.getParent(); 2428 MachineRegisterInfo &MRI = MF.getRegInfo(); 2429 2430 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); 2431 if (Subtarget->requiresStrictAlign()) { 2432 // We don't support this feature yet. 2433 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2434 return false; 2435 } 2436 2437 MIB.setInstrAndDebugLoc(I); 2438 2439 unsigned Opcode = I.getOpcode(); 2440 // G_PHI requires same handling as PHI 2441 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2442 // Certain non-generic instructions also need some special handling. 
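    // That currently means LOAD_STACK_GUARD, PHI/G_PHI (whose def still needs
    // to be constrained to a register class), plain COPYs and debug
    // instructions; everything else is left as-is.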
2443 2444 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2445 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2446 2447 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2448 const Register DefReg = I.getOperand(0).getReg(); 2449 const LLT DefTy = MRI.getType(DefReg); 2450 2451 const RegClassOrRegBank &RegClassOrBank = 2452 MRI.getRegClassOrRegBank(DefReg); 2453 2454 const TargetRegisterClass *DefRC 2455 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 2456 if (!DefRC) { 2457 if (!DefTy.isValid()) { 2458 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2459 return false; 2460 } 2461 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 2462 DefRC = getRegClassForTypeOnBank(DefTy, RB); 2463 if (!DefRC) { 2464 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2465 return false; 2466 } 2467 } 2468 2469 I.setDesc(TII.get(TargetOpcode::PHI)); 2470 2471 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2472 } 2473 2474 if (I.isCopy()) 2475 return selectCopy(I, TII, MRI, TRI, RBI); 2476 2477 if (I.isDebugInstr()) 2478 return selectDebugInstr(I, MRI, RBI); 2479 2480 return true; 2481 } 2482 2483 2484 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2485 LLVM_DEBUG( 2486 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2487 return false; 2488 } 2489 2490 // Try to do some lowering before we start instruction selecting. These 2491 // lowerings are purely transformations on the input G_MIR and so selection 2492 // must continue after any modification of the instruction. 2493 if (preISelLower(I)) { 2494 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2495 } 2496 2497 // There may be patterns where the importer can't deal with them optimally, 2498 // but does select it to a suboptimal sequence so our custom C++ selection 2499 // code later never has a chance to work on it. Therefore, we have an early 2500 // selection attempt here to give priority to certain selection routines 2501 // over the imported ones. 2502 if (earlySelect(I)) 2503 return true; 2504 2505 if (selectImpl(I, *CoverageInfo)) 2506 return true; 2507 2508 LLT Ty = 2509 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; 2510 2511 switch (Opcode) { 2512 case TargetOpcode::G_SBFX: 2513 case TargetOpcode::G_UBFX: { 2514 static const unsigned OpcTable[2][2] = { 2515 {AArch64::UBFMWri, AArch64::UBFMXri}, 2516 {AArch64::SBFMWri, AArch64::SBFMXri}}; 2517 bool IsSigned = Opcode == TargetOpcode::G_SBFX; 2518 unsigned Size = Ty.getSizeInBits(); 2519 unsigned Opc = OpcTable[IsSigned][Size == 64]; 2520 auto Cst1 = 2521 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); 2522 assert(Cst1 && "Should have gotten a constant for src 1?"); 2523 auto Cst2 = 2524 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); 2525 assert(Cst2 && "Should have gotten a constant for src 2?"); 2526 auto LSB = Cst1->Value.getZExtValue(); 2527 auto Width = Cst2->Value.getZExtValue(); 2528 auto BitfieldInst = 2529 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) 2530 .addImm(LSB) 2531 .addImm(LSB + Width - 1); 2532 I.eraseFromParent(); 2533 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); 2534 } 2535 case TargetOpcode::G_BRCOND: 2536 return selectCompareBranch(I, MF, MRI); 2537 2538 case TargetOpcode::G_BRINDIRECT: { 2539 I.setDesc(TII.get(AArch64::BR)); 2540 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2541 } 2542 2543 case TargetOpcode::G_BRJT: 2544 return selectBrJT(I, MRI); 2545 2546 case AArch64::G_ADD_LOW: { 2547 // This op may have been separated from it's ADRP companion by the localizer 2548 // or some other code motion pass. Given that many CPUs will try to 2549 // macro fuse these operations anyway, select this into a MOVaddr pseudo 2550 // which will later be expanded into an ADRP+ADD pair after scheduling. 2551 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2552 if (BaseMI->getOpcode() != AArch64::ADRP) { 2553 I.setDesc(TII.get(AArch64::ADDXri)); 2554 I.addOperand(MachineOperand::CreateImm(0)); 2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2556 } 2557 assert(TM.getCodeModel() == CodeModel::Small && 2558 "Expected small code model"); 2559 auto Op1 = BaseMI->getOperand(1); 2560 auto Op2 = I.getOperand(2); 2561 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2562 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2563 Op1.getTargetFlags()) 2564 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2565 Op2.getTargetFlags()); 2566 I.eraseFromParent(); 2567 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2568 } 2569 2570 case TargetOpcode::G_BSWAP: { 2571 // Handle vector types for G_BSWAP directly. 2572 Register DstReg = I.getOperand(0).getReg(); 2573 LLT DstTy = MRI.getType(DstReg); 2574 2575 // We should only get vector types here; everything else is handled by the 2576 // importer right now. 2577 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { 2578 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); 2579 return false; 2580 } 2581 2582 // Only handle 4 and 2 element vectors for now. 2583 // TODO: 16-bit elements. 2584 unsigned NumElts = DstTy.getNumElements(); 2585 if (NumElts != 4 && NumElts != 2) { 2586 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); 2587 return false; 2588 } 2589 2590 // Choose the correct opcode for the supported types. Right now, that's 2591 // v2s32, v4s32, and v2s64. 2592 unsigned Opc = 0; 2593 unsigned EltSize = DstTy.getElementType().getSizeInBits(); 2594 if (EltSize == 32) 2595 Opc = (DstTy.getNumElements() == 2) ? 
AArch64::REV32v8i8 2596 : AArch64::REV32v16i8; 2597 else if (EltSize == 64) 2598 Opc = AArch64::REV64v16i8; 2599 2600 // We should always get something by the time we get here... 2601 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); 2602 2603 I.setDesc(TII.get(Opc)); 2604 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2605 } 2606 2607 case TargetOpcode::G_FCONSTANT: 2608 case TargetOpcode::G_CONSTANT: { 2609 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2610 2611 const LLT s8 = LLT::scalar(8); 2612 const LLT s16 = LLT::scalar(16); 2613 const LLT s32 = LLT::scalar(32); 2614 const LLT s64 = LLT::scalar(64); 2615 const LLT s128 = LLT::scalar(128); 2616 const LLT p0 = LLT::pointer(0, 64); 2617 2618 const Register DefReg = I.getOperand(0).getReg(); 2619 const LLT DefTy = MRI.getType(DefReg); 2620 const unsigned DefSize = DefTy.getSizeInBits(); 2621 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2622 2623 // FIXME: Redundant check, but even less readable when factored out. 2624 if (isFP) { 2625 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2626 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2627 << " constant, expected: " << s16 << " or " << s32 2628 << " or " << s64 << " or " << s128 << '\n'); 2629 return false; 2630 } 2631 2632 if (RB.getID() != AArch64::FPRRegBankID) { 2633 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2634 << " constant on bank: " << RB 2635 << ", expected: FPR\n"); 2636 return false; 2637 } 2638 2639 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2640 // can be sure tablegen works correctly and isn't rescued by this code. 2641 // 0.0 is not covered by tablegen for FP128. So we will handle this 2642 // scenario in the code here. 2643 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2644 return false; 2645 } else { 2646 // s32 and s64 are covered by tablegen. 2647 if (Ty != p0 && Ty != s8 && Ty != s16) { 2648 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2649 << " constant, expected: " << s32 << ", " << s64 2650 << ", or " << p0 << '\n'); 2651 return false; 2652 } 2653 2654 if (RB.getID() != AArch64::GPRRegBankID) { 2655 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2656 << " constant on bank: " << RB 2657 << ", expected: GPR\n"); 2658 return false; 2659 } 2660 } 2661 2662 if (isFP) { 2663 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); 2664 // For 16, 64, and 128b values, emit a constant pool load. 2665 switch (DefSize) { 2666 default: 2667 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2668 case 32: 2669 case 64: { 2670 bool OptForSize = shouldOptForSize(&MF); 2671 const auto &TLI = MF.getSubtarget().getTargetLowering(); 2672 // If TLI says that this fpimm is illegal, then we'll expand to a 2673 // constant pool load. 
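        // s16 and s128 constants always take the constant-pool path below;
        // s32/s64 constants only do so when the immediate can't be
        // materialized more cheaply.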
2674 if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(), 2675 EVT::getFloatingPointVT(DefSize), OptForSize)) 2676 break; 2677 [[fallthrough]]; 2678 } 2679 case 16: 2680 case 128: { 2681 auto *FPImm = I.getOperand(1).getFPImm(); 2682 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2683 if (!LoadMI) { 2684 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2685 return false; 2686 } 2687 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2688 I.eraseFromParent(); 2689 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2690 } 2691 } 2692 2693 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size"); 2694 // Either emit a FMOV, or emit a copy to emit a normal mov. 2695 const Register DefGPRReg = MRI.createVirtualRegister( 2696 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); 2697 MachineOperand &RegOp = I.getOperand(0); 2698 RegOp.setReg(DefGPRReg); 2699 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2700 MIB.buildCopy({DefReg}, {DefGPRReg}); 2701 2702 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2703 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2704 return false; 2705 } 2706 2707 MachineOperand &ImmOp = I.getOperand(1); 2708 // FIXME: Is going through int64_t always correct? 2709 ImmOp.ChangeToImmediate( 2710 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2711 } else if (I.getOperand(1).isCImm()) { 2712 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2713 I.getOperand(1).ChangeToImmediate(Val); 2714 } else if (I.getOperand(1).isImm()) { 2715 uint64_t Val = I.getOperand(1).getImm(); 2716 I.getOperand(1).ChangeToImmediate(Val); 2717 } 2718 2719 const unsigned MovOpc = 2720 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2721 I.setDesc(TII.get(MovOpc)); 2722 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2723 return true; 2724 } 2725 case TargetOpcode::G_EXTRACT: { 2726 Register DstReg = I.getOperand(0).getReg(); 2727 Register SrcReg = I.getOperand(1).getReg(); 2728 LLT SrcTy = MRI.getType(SrcReg); 2729 LLT DstTy = MRI.getType(DstReg); 2730 (void)DstTy; 2731 unsigned SrcSize = SrcTy.getSizeInBits(); 2732 2733 if (SrcTy.getSizeInBits() > 64) { 2734 // This should be an extract of an s128, which is like a vector extract. 2735 if (SrcTy.getSizeInBits() != 128) 2736 return false; 2737 // Only support extracting 64 bits from an s128 at the moment. 2738 if (DstTy.getSizeInBits() != 64) 2739 return false; 2740 2741 unsigned Offset = I.getOperand(2).getImm(); 2742 if (Offset % 64 != 0) 2743 return false; 2744 2745 // Check we have the right regbank always. 2746 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2747 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2748 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2749 2750 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2751 auto NewI = 2752 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2753 .addUse(SrcReg, 0, 2754 Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2755 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI, 2756 AArch64::GPR64RegClass, NewI->getOperand(0)); 2757 I.eraseFromParent(); 2758 return true; 2759 } 2760 2761 // Emit the same code as a vector extract. 2762 // Offset must be a multiple of 64. 
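      // On the FPR bank the s128 source behaves like a v2s64 here: offset 0
      // is lane 0 and offset 64 is lane 1.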
2763 unsigned LaneIdx = Offset / 64; 2764 MachineInstr *Extract = emitExtractVectorElt( 2765 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2766 if (!Extract) 2767 return false; 2768 I.eraseFromParent(); 2769 return true; 2770 } 2771 2772 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); 2773 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2774 Ty.getSizeInBits() - 1); 2775 2776 if (SrcSize < 64) { 2777 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2778 "unexpected G_EXTRACT types"); 2779 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2780 } 2781 2782 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2783 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2784 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2785 .addReg(DstReg, 0, AArch64::sub_32); 2786 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2787 AArch64::GPR32RegClass, MRI); 2788 I.getOperand(0).setReg(DstReg); 2789 2790 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2791 } 2792 2793 case TargetOpcode::G_INSERT: { 2794 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2795 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2796 unsigned DstSize = DstTy.getSizeInBits(); 2797 // Larger inserts are vectors, same-size ones should be something else by 2798 // now (split up or turned into COPYs). 2799 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2800 return false; 2801 2802 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2803 unsigned LSB = I.getOperand(3).getImm(); 2804 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2805 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2806 MachineInstrBuilder(MF, I).addImm(Width - 1); 2807 2808 if (DstSize < 64) { 2809 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2810 "unexpected G_INSERT types"); 2811 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2812 } 2813 2814 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2815 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2816 TII.get(AArch64::SUBREG_TO_REG)) 2817 .addDef(SrcReg) 2818 .addImm(0) 2819 .addUse(I.getOperand(2).getReg()) 2820 .addImm(AArch64::sub_32); 2821 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2822 AArch64::GPR32RegClass, MRI); 2823 I.getOperand(2).setReg(SrcReg); 2824 2825 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2826 } 2827 case TargetOpcode::G_FRAME_INDEX: { 2828 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2829 if (Ty != LLT::pointer(0, 64)) { 2830 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2831 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2832 return false; 2833 } 2834 I.setDesc(TII.get(AArch64::ADDXri)); 2835 2836 // MOs for a #0 shifted immediate. 2837 I.addOperand(MachineOperand::CreateImm(0)); 2838 I.addOperand(MachineOperand::CreateImm(0)); 2839 2840 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2841 } 2842 2843 case TargetOpcode::G_GLOBAL_VALUE: { 2844 auto GV = I.getOperand(1).getGlobal(); 2845 if (GV->isThreadLocal()) 2846 return selectTLSGlobalValue(I, MRI); 2847 2848 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 2849 if (OpFlags & AArch64II::MO_GOT) { 2850 I.setDesc(TII.get(AArch64::LOADgot)); 2851 I.getOperand(1).setTargetFlags(OpFlags); 2852 } else if (TM.getCodeModel() == CodeModel::Large && 2853 !TM.isPositionIndependent()) { 2854 // Materialize the global using movz/movk instructions. 
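      // i.e. roughly:
      //   movz x0, #:abs_g0_nc:sym
      //   movk x0, #:abs_g1_nc:sym
      //   movk x0, #:abs_g2_nc:sym
      //   movk x0, #:abs_g3:sym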
2855 materializeLargeCMVal(I, GV, OpFlags); 2856 I.eraseFromParent(); 2857 return true; 2858 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2859 I.setDesc(TII.get(AArch64::ADR)); 2860 I.getOperand(1).setTargetFlags(OpFlags); 2861 } else { 2862 I.setDesc(TII.get(AArch64::MOVaddr)); 2863 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2864 MachineInstrBuilder MIB(MF, I); 2865 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2866 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2867 } 2868 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2869 } 2870 2871 case TargetOpcode::G_ZEXTLOAD: 2872 case TargetOpcode::G_LOAD: 2873 case TargetOpcode::G_STORE: { 2874 GLoadStore &LdSt = cast<GLoadStore>(I); 2875 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 2876 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 2877 2878 if (PtrTy != LLT::pointer(0, 64)) { 2879 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 2880 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2881 return false; 2882 } 2883 2884 uint64_t MemSizeInBytes = LdSt.getMemSize(); 2885 unsigned MemSizeInBits = LdSt.getMemSizeInBits(); 2886 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 2887 2888 // Need special instructions for atomics that affect ordering. 2889 if (Order != AtomicOrdering::NotAtomic && 2890 Order != AtomicOrdering::Unordered && 2891 Order != AtomicOrdering::Monotonic) { 2892 assert(!isa<GZExtLoad>(LdSt)); 2893 if (MemSizeInBytes > 64) 2894 return false; 2895 2896 if (isa<GLoad>(LdSt)) { 2897 static constexpr unsigned LDAPROpcodes[] = { 2898 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; 2899 static constexpr unsigned LDAROpcodes[] = { 2900 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; 2901 ArrayRef<unsigned> Opcodes = 2902 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent 2903 ? LDAPROpcodes 2904 : LDAROpcodes; 2905 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2906 } else { 2907 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 2908 AArch64::STLRW, AArch64::STLRX}; 2909 Register ValReg = LdSt.getReg(0); 2910 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 2911 // Emit a subreg copy of 32 bits. 2912 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2913 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 2914 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 2915 I.getOperand(0).setReg(NewVal); 2916 } 2917 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 2918 } 2919 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2920 return true; 2921 } 2922 2923 #ifndef NDEBUG 2924 const Register PtrReg = LdSt.getPointerReg(); 2925 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 2926 // Check that the pointer register is valid. 2927 assert(PtrRB.getID() == AArch64::GPRRegBankID && 2928 "Load/Store pointer operand isn't a GPR"); 2929 assert(MRI.getType(PtrReg).isPointer() && 2930 "Load/Store pointer operand isn't a pointer"); 2931 #endif 2932 2933 const Register ValReg = LdSt.getReg(0); 2934 const LLT ValTy = MRI.getType(ValReg); 2935 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 2936 2937 // The code below doesn't support truncating stores, so we need to split it 2938 // again. 
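    // A truncating store becomes a sub-register copy of the low part of the
    // value (e.g. sub_32 of a 64-bit GPR) feeding a store of the narrower
    // type.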
2939 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2940 unsigned SubReg; 2941 LLT MemTy = LdSt.getMMO().getMemoryType(); 2942 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2943 if (!getSubRegForClass(RC, TRI, SubReg)) 2944 return false; 2945 2946 // Generate a subreg copy. 2947 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 2948 .addReg(ValReg, 0, SubReg) 2949 .getReg(0); 2950 RBI.constrainGenericRegister(Copy, *RC, MRI); 2951 LdSt.getOperand(0).setReg(Copy); 2952 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 2953 // If this is an any-extending load from the FPR bank, split it into a regular 2954 // load + extend. 2955 if (RB.getID() == AArch64::FPRRegBankID) { 2956 unsigned SubReg; 2957 LLT MemTy = LdSt.getMMO().getMemoryType(); 2958 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 2959 if (!getSubRegForClass(RC, TRI, SubReg)) 2960 return false; 2961 Register OldDst = LdSt.getReg(0); 2962 Register NewDst = 2963 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 2964 LdSt.getOperand(0).setReg(NewDst); 2965 MRI.setRegBank(NewDst, RB); 2966 // Generate a SUBREG_TO_REG to extend it. 2967 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 2968 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 2969 .addImm(0) 2970 .addUse(NewDst) 2971 .addImm(SubReg); 2972 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); 2973 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 2974 MIB.setInstr(LdSt); 2975 } 2976 } 2977 2978 // Helper lambda for partially selecting I. Either returns the original 2979 // instruction with an updated opcode, or a new instruction. 2980 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 2981 bool IsStore = isa<GStore>(I); 2982 const unsigned NewOpc = 2983 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 2984 if (NewOpc == I.getOpcode()) 2985 return nullptr; 2986 // Check if we can fold anything into the addressing mode. 2987 auto AddrModeFns = 2988 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 2989 if (!AddrModeFns) { 2990 // Can't fold anything. Use the original instruction. 2991 I.setDesc(TII.get(NewOpc)); 2992 I.addOperand(MachineOperand::CreateImm(0)); 2993 return &I; 2994 } 2995 2996 // Folded something. Create a new instruction and return it. 2997 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 2998 Register CurValReg = I.getOperand(0).getReg(); 2999 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 3000 NewInst.cloneMemRefs(I); 3001 for (auto &Fn : *AddrModeFns) 3002 Fn(NewInst); 3003 I.eraseFromParent(); 3004 return &*NewInst; 3005 }; 3006 3007 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 3008 if (!LoadStore) 3009 return false; 3010 3011 // If we're storing a 0, use WZR/XZR. 3012 if (Opcode == TargetOpcode::G_STORE) { 3013 auto CVal = getIConstantVRegValWithLookThrough( 3014 LoadStore->getOperand(0).getReg(), MRI); 3015 if (CVal && CVal->Value == 0) { 3016 switch (LoadStore->getOpcode()) { 3017 case AArch64::STRWui: 3018 case AArch64::STRHHui: 3019 case AArch64::STRBBui: 3020 LoadStore->getOperand(0).setReg(AArch64::WZR); 3021 break; 3022 case AArch64::STRXui: 3023 LoadStore->getOperand(0).setReg(AArch64::XZR); 3024 break; 3025 } 3026 } 3027 } 3028 3029 if (IsZExtLoad) { 3030 // The zextload from a smaller type to i32 should be handled by the 3031 // importer. 
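    // Here we only need the 64-bit result: select the load with a 32-bit
    // destination (writing a W register already zeroes the top 32 bits) and
    // wrap it in a SUBREG_TO_REG to form the 64-bit value.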
3032 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 3033 return false; 3034 // If we have a ZEXTLOAD then change the load's type to be a narrower reg 3035 // and zero_extend with SUBREG_TO_REG. 3036 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3037 Register DstReg = LoadStore->getOperand(0).getReg(); 3038 LoadStore->getOperand(0).setReg(LdReg); 3039 3040 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 3041 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 3042 .addImm(0) 3043 .addUse(LdReg) 3044 .addImm(AArch64::sub_32); 3045 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3046 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 3047 MRI); 3048 } 3049 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3050 } 3051 3052 case TargetOpcode::G_INDEXED_ZEXTLOAD: 3053 case TargetOpcode::G_INDEXED_SEXTLOAD: 3054 return selectIndexedExtLoad(I, MRI); 3055 case TargetOpcode::G_INDEXED_LOAD: 3056 return selectIndexedLoad(I, MRI); 3057 case TargetOpcode::G_INDEXED_STORE: 3058 return selectIndexedStore(cast<GIndexedStore>(I), MRI); 3059 3060 case TargetOpcode::G_SMULH: 3061 case TargetOpcode::G_UMULH: { 3062 // Reject the various things we don't support yet. 3063 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3064 return false; 3065 3066 const Register DefReg = I.getOperand(0).getReg(); 3067 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3068 3069 if (RB.getID() != AArch64::GPRRegBankID) { 3070 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); 3071 return false; 3072 } 3073 3074 if (Ty != LLT::scalar(64)) { 3075 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty 3076 << ", expected: " << LLT::scalar(64) << '\n'); 3077 return false; 3078 } 3079 3080 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr 3081 : AArch64::UMULHrr; 3082 I.setDesc(TII.get(NewOpc)); 3083 3084 // Now that we selected an opcode, we need to constrain the register 3085 // operands to use appropriate classes. 3086 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3087 } 3088 case TargetOpcode::G_LSHR: 3089 case TargetOpcode::G_ASHR: 3090 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 3091 return selectVectorAshrLshr(I, MRI); 3092 [[fallthrough]]; 3093 case TargetOpcode::G_SHL: 3094 if (Opcode == TargetOpcode::G_SHL && 3095 MRI.getType(I.getOperand(0).getReg()).isVector()) 3096 return selectVectorSHL(I, MRI); 3097 3098 // These shifts were legalized to have 64 bit shift amounts because we 3099 // want to take advantage of the selection patterns that assume the 3100 // immediates are s64s, however, selectBinaryOp will assume both operands 3101 // will have the same bit size. 3102 { 3103 Register SrcReg = I.getOperand(1).getReg(); 3104 Register ShiftReg = I.getOperand(2).getReg(); 3105 const LLT ShiftTy = MRI.getType(ShiftReg); 3106 const LLT SrcTy = MRI.getType(SrcReg); 3107 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 3108 ShiftTy.getSizeInBits() == 64) { 3109 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 3110 // Insert a subregister copy to implement a 64->32 trunc 3111 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 3112 .addReg(ShiftReg, 0, AArch64::sub_32); 3113 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 3114 I.getOperand(2).setReg(Trunc.getReg(0)); 3115 } 3116 } 3117 [[fallthrough]]; 3118 case TargetOpcode::G_OR: { 3119 // Reject the various things we don't support yet. 
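    // unsupportedBinOp filters those out; after that, selectBinaryOp maps the
    // opcode, register bank and operand size onto a concrete instruction
    // (e.g. a 32-bit G_OR on GPRs is expected to become ORRWrr).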
3120 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3121 return false; 3122 3123 const unsigned OpSize = Ty.getSizeInBits(); 3124 3125 const Register DefReg = I.getOperand(0).getReg(); 3126 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3127 3128 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 3129 if (NewOpc == I.getOpcode()) 3130 return false; 3131 3132 I.setDesc(TII.get(NewOpc)); 3133 // FIXME: Should the type be always reset in setDesc? 3134 3135 // Now that we selected an opcode, we need to constrain the register 3136 // operands to use appropriate classes. 3137 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3138 } 3139 3140 case TargetOpcode::G_PTR_ADD: { 3141 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 3142 I.eraseFromParent(); 3143 return true; 3144 } 3145 3146 case TargetOpcode::G_SADDE: 3147 case TargetOpcode::G_UADDE: 3148 case TargetOpcode::G_SSUBE: 3149 case TargetOpcode::G_USUBE: 3150 case TargetOpcode::G_SADDO: 3151 case TargetOpcode::G_UADDO: 3152 case TargetOpcode::G_SSUBO: 3153 case TargetOpcode::G_USUBO: 3154 return selectOverflowOp(I, MRI); 3155 3156 case TargetOpcode::G_PTRMASK: { 3157 Register MaskReg = I.getOperand(2).getReg(); 3158 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3159 // TODO: Implement arbitrary cases 3160 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3161 return false; 3162 3163 uint64_t Mask = *MaskVal; 3164 I.setDesc(TII.get(AArch64::ANDXri)); 3165 I.getOperand(2).ChangeToImmediate( 3166 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3167 3168 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3169 } 3170 case TargetOpcode::G_PTRTOINT: 3171 case TargetOpcode::G_TRUNC: { 3172 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3173 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3174 3175 const Register DstReg = I.getOperand(0).getReg(); 3176 const Register SrcReg = I.getOperand(1).getReg(); 3177 3178 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3179 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3180 3181 if (DstRB.getID() != SrcRB.getID()) { 3182 LLVM_DEBUG( 3183 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3184 return false; 3185 } 3186 3187 if (DstRB.getID() == AArch64::GPRRegBankID) { 3188 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3189 if (!DstRC) 3190 return false; 3191 3192 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); 3193 if (!SrcRC) 3194 return false; 3195 3196 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3197 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3198 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); 3199 return false; 3200 } 3201 3202 if (DstRC == SrcRC) { 3203 // Nothing to be done 3204 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3205 SrcTy == LLT::scalar(64)) { 3206 llvm_unreachable("TableGen can import this case"); 3207 return false; 3208 } else if (DstRC == &AArch64::GPR32RegClass && 3209 SrcRC == &AArch64::GPR64RegClass) { 3210 I.getOperand(1).setSubReg(AArch64::sub_32); 3211 } else { 3212 LLVM_DEBUG( 3213 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3214 return false; 3215 } 3216 3217 I.setDesc(TII.get(TargetOpcode::COPY)); 3218 return true; 3219 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3220 if (DstTy == LLT::fixed_vector(4, 16) && 3221 SrcTy == LLT::fixed_vector(4, 32)) { 3222 
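// E.g. %dst:fpr(<4 x s16>) = G_TRUNC %src:fpr(<4 x s32>) is selected to
// XTNv4i16, which narrows each 32-bit element to 16 bits.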
I.setDesc(TII.get(AArch64::XTNv4i16)); 3223 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3224 return true; 3225 } 3226 3227 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3228 MachineInstr *Extract = emitExtractVectorElt( 3229 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3230 if (!Extract) 3231 return false; 3232 I.eraseFromParent(); 3233 return true; 3234 } 3235 3236 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 3237 if (Opcode == TargetOpcode::G_PTRTOINT) { 3238 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3239 I.setDesc(TII.get(TargetOpcode::COPY)); 3240 return selectCopy(I, TII, MRI, TRI, RBI); 3241 } 3242 } 3243 3244 return false; 3245 } 3246 3247 case TargetOpcode::G_ANYEXT: { 3248 if (selectUSMovFromExtend(I, MRI)) 3249 return true; 3250 3251 const Register DstReg = I.getOperand(0).getReg(); 3252 const Register SrcReg = I.getOperand(1).getReg(); 3253 3254 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3255 if (RBDst.getID() != AArch64::GPRRegBankID) { 3256 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3257 << ", expected: GPR\n"); 3258 return false; 3259 } 3260 3261 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3262 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3263 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3264 << ", expected: GPR\n"); 3265 return false; 3266 } 3267 3268 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3269 3270 if (DstSize == 0) { 3271 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3272 return false; 3273 } 3274 3275 if (DstSize != 64 && DstSize > 32) { 3276 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3277 << ", expected: 32 or 64\n"); 3278 return false; 3279 } 3280 // At this point G_ANYEXT is just like a plain COPY, but we need 3281 // to explicitly form the 64-bit value if any. 3282 if (DstSize > 32) { 3283 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3284 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3285 .addDef(ExtSrc) 3286 .addImm(0) 3287 .addUse(SrcReg) 3288 .addImm(AArch64::sub_32); 3289 I.getOperand(1).setReg(ExtSrc); 3290 } 3291 return selectCopy(I, TII, MRI, TRI, RBI); 3292 } 3293 3294 case TargetOpcode::G_ZEXT: 3295 case TargetOpcode::G_SEXT_INREG: 3296 case TargetOpcode::G_SEXT: { 3297 if (selectUSMovFromExtend(I, MRI)) 3298 return true; 3299 3300 unsigned Opcode = I.getOpcode(); 3301 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3302 const Register DefReg = I.getOperand(0).getReg(); 3303 Register SrcReg = I.getOperand(1).getReg(); 3304 const LLT DstTy = MRI.getType(DefReg); 3305 const LLT SrcTy = MRI.getType(SrcReg); 3306 unsigned DstSize = DstTy.getSizeInBits(); 3307 unsigned SrcSize = SrcTy.getSizeInBits(); 3308 3309 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3310 // extended is encoded in the imm. 3311 if (Opcode == TargetOpcode::G_SEXT_INREG) 3312 SrcSize = I.getOperand(2).getImm(); 3313 3314 if (DstTy.isVector()) 3315 return false; // Should be handled by imported patterns. 3316 3317 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3318 AArch64::GPRRegBankID && 3319 "Unexpected ext regbank"); 3320 3321 MachineInstr *ExtI; 3322 3323 // First check if we're extending the result of a load which has a dest type 3324 // smaller than 32 bits, then this zext is redundant. 
GPR32 is the smallest 3325 // GPR register on AArch64 and all loads which are smaller automatically 3326 // zero-extend the upper bits. E.g. 3327 // %v(s8) = G_LOAD %p, :: (load 1) 3328 // %v2(s32) = G_ZEXT %v(s8) 3329 if (!IsSigned) { 3330 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3331 bool IsGPR = 3332 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3333 if (LoadMI && IsGPR) { 3334 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3335 unsigned BytesLoaded = MemOp->getSize(); 3336 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3337 return selectCopy(I, TII, MRI, TRI, RBI); 3338 } 3339 3340 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3341 // + SUBREG_TO_REG. 3342 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3343 Register SubregToRegSrc = 3344 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3345 const Register ZReg = AArch64::WZR; 3346 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg}) 3347 .addImm(0); 3348 3349 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3350 .addImm(0) 3351 .addUse(SubregToRegSrc) 3352 .addImm(AArch64::sub_32); 3353 3354 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3355 MRI)) { 3356 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3357 return false; 3358 } 3359 3360 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3361 MRI)) { 3362 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3363 return false; 3364 } 3365 3366 I.eraseFromParent(); 3367 return true; 3368 } 3369 } 3370 3371 if (DstSize == 64) { 3372 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3373 // FIXME: Can we avoid manually doing this? 3374 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3375 MRI)) { 3376 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3377 << " operand\n"); 3378 return false; 3379 } 3380 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3381 {&AArch64::GPR64RegClass}, {}) 3382 .addImm(0) 3383 .addUse(SrcReg) 3384 .addImm(AArch64::sub_32) 3385 .getReg(0); 3386 } 3387 3388 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3389 {DefReg}, {SrcReg}) 3390 .addImm(0) 3391 .addImm(SrcSize - 1); 3392 } else if (DstSize <= 32) { 3393 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, 3394 {DefReg}, {SrcReg}) 3395 .addImm(0) 3396 .addImm(SrcSize - 1); 3397 } else { 3398 return false; 3399 } 3400 3401 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3402 I.eraseFromParent(); 3403 return true; 3404 } 3405 3406 case TargetOpcode::G_SITOFP: 3407 case TargetOpcode::G_UITOFP: 3408 case TargetOpcode::G_FPTOSI: 3409 case TargetOpcode::G_FPTOUI: { 3410 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3411 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3412 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3413 if (NewOpc == Opcode) 3414 return false; 3415 3416 I.setDesc(TII.get(NewOpc)); 3417 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3418 I.setFlags(MachineInstr::NoFPExcept); 3419 3420 return true; 3421 } 3422 3423 case TargetOpcode::G_FREEZE: 3424 return selectCopy(I, TII, MRI, TRI, RBI); 3425 3426 case TargetOpcode::G_INTTOPTR: 3427 // The importer is currently unable to import pointer types since they 3428 // didn't exist in SelectionDAG. 
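// E.g. %p:gpr(p0) = G_INTTOPTR %x:gpr(s64) is selected as a plain COPY,
// since p0 and s64 values live in the same register class here.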
3429 return selectCopy(I, TII, MRI, TRI, RBI); 3430 3431 case TargetOpcode::G_BITCAST: 3432 // Imported SelectionDAG rules can handle every bitcast except those that 3433 // bitcast from a type to the same type. Ideally, these shouldn't occur 3434 // but we might not run an optimizer that deletes them. The other exception 3435 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3436 // of them. 3437 return selectCopy(I, TII, MRI, TRI, RBI); 3438 3439 case TargetOpcode::G_SELECT: { 3440 auto &Sel = cast<GSelect>(I); 3441 const Register CondReg = Sel.getCondReg(); 3442 const Register TReg = Sel.getTrueReg(); 3443 const Register FReg = Sel.getFalseReg(); 3444 3445 if (tryOptSelect(Sel)) 3446 return true; 3447 3448 // Make sure to use an unused vreg instead of wzr, so that the peephole 3449 // optimizations will be able to optimize these. 3450 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3451 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3452 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3453 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3454 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) 3455 return false; 3456 Sel.eraseFromParent(); 3457 return true; 3458 } 3459 case TargetOpcode::G_ICMP: { 3460 if (Ty.isVector()) 3461 return selectVectorICmp(I, MRI); 3462 3463 if (Ty != LLT::scalar(32)) { 3464 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3465 << ", expected: " << LLT::scalar(32) << '\n'); 3466 return false; 3467 } 3468 3469 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3470 const AArch64CC::CondCode InvCC = 3471 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3472 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3473 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3474 /*Src2=*/AArch64::WZR, InvCC, MIB); 3475 I.eraseFromParent(); 3476 return true; 3477 } 3478 3479 case TargetOpcode::G_FCMP: { 3480 CmpInst::Predicate Pred = 3481 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3482 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3483 Pred) || 3484 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3485 return false; 3486 I.eraseFromParent(); 3487 return true; 3488 } 3489 case TargetOpcode::G_VASTART: 3490 return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI) 3491 : selectVaStartAAPCS(I, MF, MRI); 3492 case TargetOpcode::G_INTRINSIC: 3493 return selectIntrinsic(I, MRI); 3494 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3495 return selectIntrinsicWithSideEffects(I, MRI); 3496 case TargetOpcode::G_IMPLICIT_DEF: { 3497 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 3498 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3499 const Register DstReg = I.getOperand(0).getReg(); 3500 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3501 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3502 RBI.constrainGenericRegister(DstReg, *DstRC, MRI); 3503 return true; 3504 } 3505 case TargetOpcode::G_BLOCK_ADDR: { 3506 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { 3507 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); 3508 I.eraseFromParent(); 3509 return true; 3510 } else { 3511 I.setDesc(TII.get(AArch64::MOVaddrBA)); 3512 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), 3513 I.getOperand(0).getReg()) 3514 .addBlockAddress(I.getOperand(1).getBlockAddress(), 3515 /* Offset */ 0, AArch64II::MO_PAGE) 3516 .addBlockAddress( 3517 I.getOperand(1).getBlockAddress(), /* Offset */ 0, 3518 AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3519 I.eraseFromParent(); 3520 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3521 } 3522 } 3523 case AArch64::G_DUP: { 3524 // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by the 3525 // imported patterns, so do it manually here. Avoiding generating an s16 gpr 3526 // is difficult because, at RBS, adding an anyextend to fix this may end up 3527 // pessimizing the fpr case. Manual selection is the most 3528 // robust solution for now. 3529 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 3530 AArch64::GPRRegBankID) 3531 return false; // We expect the fpr regbank case to be imported.
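// E.g. %v:fpr(<4 x s16>) = G_DUP %s:gpr(s16) is selected to DUPv4i16gpr,
// which broadcasts the W-register scalar into every 16-bit lane.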
3532 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3533 if (VecTy == LLT::fixed_vector(8, 8)) 3534 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3535 else if (VecTy == LLT::fixed_vector(16, 8)) 3536 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3537 else if (VecTy == LLT::fixed_vector(4, 16)) 3538 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3539 else if (VecTy == LLT::fixed_vector(8, 16)) 3540 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3541 else 3542 return false; 3543 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3544 } 3545 case TargetOpcode::G_BUILD_VECTOR: 3546 return selectBuildVector(I, MRI); 3547 case TargetOpcode::G_MERGE_VALUES: 3548 return selectMergeValues(I, MRI); 3549 case TargetOpcode::G_UNMERGE_VALUES: 3550 return selectUnmergeValues(I, MRI); 3551 case TargetOpcode::G_SHUFFLE_VECTOR: 3552 return selectShuffleVector(I, MRI); 3553 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3554 return selectExtractElt(I, MRI); 3555 case TargetOpcode::G_INSERT_VECTOR_ELT: 3556 return selectInsertElt(I, MRI); 3557 case TargetOpcode::G_CONCAT_VECTORS: 3558 return selectConcatVectors(I, MRI); 3559 case TargetOpcode::G_JUMP_TABLE: 3560 return selectJumpTable(I, MRI); 3561 case TargetOpcode::G_MEMCPY: 3562 case TargetOpcode::G_MEMCPY_INLINE: 3563 case TargetOpcode::G_MEMMOVE: 3564 case TargetOpcode::G_MEMSET: 3565 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); 3566 return selectMOPS(I, MRI); 3567 } 3568 3569 return false; 3570 } 3571 3572 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) { 3573 MachineIRBuilderState OldMIBState = MIB.getState(); 3574 bool Success = select(I); 3575 MIB.setState(OldMIBState); 3576 return Success; 3577 } 3578 3579 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, 3580 MachineRegisterInfo &MRI) { 3581 unsigned Mopcode; 3582 switch (GI.getOpcode()) { 3583 case TargetOpcode::G_MEMCPY: 3584 case TargetOpcode::G_MEMCPY_INLINE: 3585 Mopcode = AArch64::MOPSMemoryCopyPseudo; 3586 break; 3587 case TargetOpcode::G_MEMMOVE: 3588 Mopcode = AArch64::MOPSMemoryMovePseudo; 3589 break; 3590 case TargetOpcode::G_MEMSET: 3591 // For tagged memset see llvm.aarch64.mops.memset.tag 3592 Mopcode = AArch64::MOPSMemorySetPseudo; 3593 break; 3594 } 3595 3596 auto &DstPtr = GI.getOperand(0); 3597 auto &SrcOrVal = GI.getOperand(1); 3598 auto &Size = GI.getOperand(2); 3599 3600 // Create copies of the registers that can be clobbered. 3601 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); 3602 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); 3603 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); 3604 3605 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; 3606 const auto &SrcValRegClass = 3607 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; 3608 3609 // Constrain to specific registers 3610 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); 3611 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); 3612 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); 3613 3614 MIB.buildCopy(DstPtrCopy, DstPtr); 3615 MIB.buildCopy(SrcValCopy, SrcOrVal); 3616 MIB.buildCopy(SizeCopy, Size); 3617 3618 // New instruction uses the copied registers because it must update them. 3619 // The defs are not used since they don't exist in G_MEM*. They are still 3620 // tied. 
3621 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE 3622 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); 3623 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3624 if (IsSet) { 3625 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, 3626 {DstPtrCopy, SizeCopy, SrcValCopy}); 3627 } else { 3628 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); 3629 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, 3630 {DstPtrCopy, SrcValCopy, SizeCopy}); 3631 } 3632 3633 GI.eraseFromParent(); 3634 return true; 3635 } 3636 3637 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3638 MachineRegisterInfo &MRI) { 3639 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3640 Register JTAddr = I.getOperand(0).getReg(); 3641 unsigned JTI = I.getOperand(1).getIndex(); 3642 Register Index = I.getOperand(2).getReg(); 3643 3644 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3645 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3646 3647 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3648 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3649 {TargetReg, ScratchReg}, {JTAddr, Index}) 3650 .addJumpTableIndex(JTI); 3651 // Save the jump table info. 3652 MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {}, 3653 {static_cast<int64_t>(JTI)}); 3654 // Build the indirect branch. 3655 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3656 I.eraseFromParent(); 3657 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3658 } 3659 3660 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3661 MachineRegisterInfo &MRI) { 3662 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3663 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3664 3665 Register DstReg = I.getOperand(0).getReg(); 3666 unsigned JTI = I.getOperand(1).getIndex(); 3667 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 3668 auto MovMI = 3669 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3670 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3671 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3672 I.eraseFromParent(); 3673 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3674 } 3675 3676 bool AArch64InstructionSelector::selectTLSGlobalValue( 3677 MachineInstr &I, MachineRegisterInfo &MRI) { 3678 if (!STI.isTargetMachO()) 3679 return false; 3680 MachineFunction &MF = *I.getParent()->getParent(); 3681 MF.getFrameInfo().setAdjustsStack(true); 3682 3683 const auto &GlobalOp = I.getOperand(1); 3684 assert(GlobalOp.getOffset() == 0 && 3685 "Shouldn't have an offset on TLS globals!"); 3686 const GlobalValue &GV = *GlobalOp.getGlobal(); 3687 3688 auto LoadGOT = 3689 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3690 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3691 3692 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3693 {LoadGOT.getReg(0)}) 3694 .addImm(0); 3695 3696 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3697 // TLS calls preserve all registers except those that absolutely must be 3698 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3699 // silly). 
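// The emitted sequence is roughly (virtual register names are made up):
//   %got:gpr64common = LOADgot @tlsvar [AArch64II::MO_TLS] ; TLV descriptor
//   %fn:gpr64common  = LDRXui %got, 0                      ; descriptor's resolver
//   $x0 = COPY %got
//   BLR %fn, implicit $x0, implicit-def $x0, <TLS call regmask>
//   %dst = COPY $x0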
3700 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) 3701 .addUse(AArch64::X0, RegState::Implicit) 3702 .addDef(AArch64::X0, RegState::Implicit) 3703 .addRegMask(TRI.getTLSCallPreservedMask()); 3704 3705 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3706 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3707 MRI); 3708 I.eraseFromParent(); 3709 return true; 3710 } 3711 3712 bool AArch64InstructionSelector::selectVectorICmp( 3713 MachineInstr &I, MachineRegisterInfo &MRI) { 3714 Register DstReg = I.getOperand(0).getReg(); 3715 LLT DstTy = MRI.getType(DstReg); 3716 Register SrcReg = I.getOperand(2).getReg(); 3717 Register Src2Reg = I.getOperand(3).getReg(); 3718 LLT SrcTy = MRI.getType(SrcReg); 3719 3720 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); 3721 unsigned NumElts = DstTy.getNumElements(); 3722 3723 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b 3724 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 3725 // Third index is cc opcode: 3726 // 0 == eq 3727 // 1 == ugt 3728 // 2 == uge 3729 // 3 == ult 3730 // 4 == ule 3731 // 5 == sgt 3732 // 6 == sge 3733 // 7 == slt 3734 // 8 == sle 3735 // ne is done by negating 'eq' result. 3736 3737 // This table below assumes that for some comparisons the operands will be 3738 // commuted. 3739 // ult op == commute + ugt op 3740 // ule op == commute + uge op 3741 // slt op == commute + sgt op 3742 // sle op == commute + sge op 3743 unsigned PredIdx = 0; 3744 bool SwapOperands = false; 3745 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 3746 switch (Pred) { 3747 case CmpInst::ICMP_NE: 3748 case CmpInst::ICMP_EQ: 3749 PredIdx = 0; 3750 break; 3751 case CmpInst::ICMP_UGT: 3752 PredIdx = 1; 3753 break; 3754 case CmpInst::ICMP_UGE: 3755 PredIdx = 2; 3756 break; 3757 case CmpInst::ICMP_ULT: 3758 PredIdx = 3; 3759 SwapOperands = true; 3760 break; 3761 case CmpInst::ICMP_ULE: 3762 PredIdx = 4; 3763 SwapOperands = true; 3764 break; 3765 case CmpInst::ICMP_SGT: 3766 PredIdx = 5; 3767 break; 3768 case CmpInst::ICMP_SGE: 3769 PredIdx = 6; 3770 break; 3771 case CmpInst::ICMP_SLT: 3772 PredIdx = 7; 3773 SwapOperands = true; 3774 break; 3775 case CmpInst::ICMP_SLE: 3776 PredIdx = 8; 3777 SwapOperands = true; 3778 break; 3779 default: 3780 llvm_unreachable("Unhandled icmp predicate"); 3781 return false; 3782 } 3783 3784 // This table obviously should be tablegen'd when we have our GISel native 3785 // tablegen selector. 
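// E.g. an unsigned-less-than compare of <4 x s32> vectors uses
// EltIdx = Log2(32 / 8) = 2, NumEltsIdx = Log2(4 / 2) = 1 and PredIdx = 3,
// which selects CMHIv4i32 with the operands swapped.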
3786 3787 static const unsigned OpcTable[4][4][9] = { 3788 { 3789 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3790 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3791 0 /* invalid */}, 3792 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3793 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3794 0 /* invalid */}, 3795 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, 3796 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, 3797 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, 3798 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, 3799 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, 3800 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} 3801 }, 3802 { 3803 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3804 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3805 0 /* invalid */}, 3806 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, 3807 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, 3808 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, 3809 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, 3810 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, 3811 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, 3812 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3813 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3814 0 /* invalid */} 3815 }, 3816 { 3817 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, 3818 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, 3819 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, 3820 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, 3821 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, 3822 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, 3823 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3824 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3825 0 /* invalid */}, 3826 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3827 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3828 0 /* invalid */} 3829 }, 3830 { 3831 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, 3832 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, 3833 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, 3834 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3835 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3836 0 /* invalid */}, 3837 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3838 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3839 0 /* invalid */}, 3840 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3841 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 3842 0 /* invalid */} 3843 }, 3844 }; 3845 unsigned EltIdx = Log2_32(SrcEltSize / 8); 3846 unsigned NumEltsIdx = Log2_32(NumElts / 2); 3847 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; 3848 if (!Opc) { 3849 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); 3850 return false; 3851 } 3852 3853 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3854 const TargetRegisterClass *SrcRC = 3855 getRegClassForTypeOnBank(SrcTy, VecRB, true); 3856 if (!SrcRC) { 3857 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 3858 return 
false; 3859 } 3860 3861 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; 3862 if (SrcTy.getSizeInBits() == 128) 3863 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; 3864 3865 if (SwapOperands) 3866 std::swap(SrcReg, Src2Reg); 3867 3868 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); 3869 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3870 3871 // Invert if we had a 'ne' cc. 3872 if (NotOpc) { 3873 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); 3874 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); 3875 } else { 3876 MIB.buildCopy(DstReg, Cmp.getReg(0)); 3877 } 3878 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); 3879 I.eraseFromParent(); 3880 return true; 3881 } 3882 3883 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3884 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3885 MachineIRBuilder &MIRBuilder) const { 3886 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3887 3888 auto BuildFn = [&](unsigned SubregIndex) { 3889 auto Ins = 3890 MIRBuilder 3891 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3892 .addImm(SubregIndex); 3893 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3894 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3895 return &*Ins; 3896 }; 3897 3898 switch (EltSize) { 3899 case 8: 3900 return BuildFn(AArch64::bsub); 3901 case 16: 3902 return BuildFn(AArch64::hsub); 3903 case 32: 3904 return BuildFn(AArch64::ssub); 3905 case 64: 3906 return BuildFn(AArch64::dsub); 3907 default: 3908 return nullptr; 3909 } 3910 } 3911 3912 MachineInstr * 3913 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg, 3914 MachineIRBuilder &MIB, 3915 MachineRegisterInfo &MRI) const { 3916 LLT DstTy = MRI.getType(DstReg); 3917 const TargetRegisterClass *RC = 3918 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI)); 3919 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 3920 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 3921 return nullptr; 3922 } 3923 unsigned SubReg = 0; 3924 if (!getSubRegForClass(RC, TRI, SubReg)) 3925 return nullptr; 3926 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 3927 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" 3928 << DstTy.getSizeInBits() << "\n"); 3929 return nullptr; 3930 } 3931 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3932 .addReg(SrcReg, 0, SubReg); 3933 RBI.constrainGenericRegister(DstReg, *RC, MRI); 3934 return Copy; 3935 } 3936 3937 bool AArch64InstructionSelector::selectMergeValues( 3938 MachineInstr &I, MachineRegisterInfo &MRI) { 3939 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 3940 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3941 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3942 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 3943 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 3944 3945 if (I.getNumOperands() != 3) 3946 return false; 3947 3948 // Merging 2 s64s into an s128. 
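// E.g. %dst(s128) = G_MERGE_VALUES %lo(s64), %hi(s64) becomes roughly:
//   %undef:fpr128 = IMPLICIT_DEF
//   %t:fpr128 = INSvi64gpr %undef, 0, %lo ; or INSvi64lane for FPR sources
//   %dst:fpr128 = INSvi64gpr %t, 1, %hi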
3949 if (DstTy == LLT::scalar(128)) { 3950 if (SrcTy.getSizeInBits() != 64) 3951 return false; 3952 Register DstReg = I.getOperand(0).getReg(); 3953 Register Src1Reg = I.getOperand(1).getReg(); 3954 Register Src2Reg = I.getOperand(2).getReg(); 3955 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 3956 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg, 3957 /* LaneIdx */ 0, RB, MIB); 3958 if (!InsMI) 3959 return false; 3960 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 3961 Src2Reg, /* LaneIdx */ 1, RB, MIB); 3962 if (!Ins2MI) 3963 return false; 3964 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 3965 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 3966 I.eraseFromParent(); 3967 return true; 3968 } 3969 3970 if (RB.getID() != AArch64::GPRRegBankID) 3971 return false; 3972 3973 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 3974 return false; 3975 3976 auto *DstRC = &AArch64::GPR64RegClass; 3977 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 3978 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3979 TII.get(TargetOpcode::SUBREG_TO_REG)) 3980 .addDef(SubToRegDef) 3981 .addImm(0) 3982 .addUse(I.getOperand(1).getReg()) 3983 .addImm(AArch64::sub_32); 3984 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 3985 // Need to anyext the second scalar before we can use bfm 3986 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3987 TII.get(TargetOpcode::SUBREG_TO_REG)) 3988 .addDef(SubToRegDef2) 3989 .addImm(0) 3990 .addUse(I.getOperand(2).getReg()) 3991 .addImm(AArch64::sub_32); 3992 MachineInstr &BFM = 3993 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 3994 .addDef(I.getOperand(0).getReg()) 3995 .addUse(SubToRegDef) 3996 .addUse(SubToRegDef2) 3997 .addImm(32) 3998 .addImm(31); 3999 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 4000 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 4001 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 4002 I.eraseFromParent(); 4003 return true; 4004 } 4005 4006 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 4007 const unsigned EltSize) { 4008 // Choose a lane copy opcode and subregister based off of the size of the 4009 // vector's elements. 4010 switch (EltSize) { 4011 case 8: 4012 CopyOpc = AArch64::DUPi8; 4013 ExtractSubReg = AArch64::bsub; 4014 break; 4015 case 16: 4016 CopyOpc = AArch64::DUPi16; 4017 ExtractSubReg = AArch64::hsub; 4018 break; 4019 case 32: 4020 CopyOpc = AArch64::DUPi32; 4021 ExtractSubReg = AArch64::ssub; 4022 break; 4023 case 64: 4024 CopyOpc = AArch64::DUPi64; 4025 ExtractSubReg = AArch64::dsub; 4026 break; 4027 default: 4028 // Unknown size, bail out. 
4029 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 4030 return false; 4031 } 4032 return true; 4033 } 4034 4035 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 4036 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 4037 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 4038 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4039 unsigned CopyOpc = 0; 4040 unsigned ExtractSubReg = 0; 4041 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 4042 LLVM_DEBUG( 4043 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 4044 return nullptr; 4045 } 4046 4047 const TargetRegisterClass *DstRC = 4048 getRegClassForTypeOnBank(ScalarTy, DstRB, true); 4049 if (!DstRC) { 4050 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 4051 return nullptr; 4052 } 4053 4054 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 4055 const LLT &VecTy = MRI.getType(VecReg); 4056 const TargetRegisterClass *VecRC = 4057 getRegClassForTypeOnBank(VecTy, VecRB, true); 4058 if (!VecRC) { 4059 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 4060 return nullptr; 4061 } 4062 4063 // The register that we're going to copy into. 4064 Register InsertReg = VecReg; 4065 if (!DstReg) 4066 DstReg = MRI.createVirtualRegister(DstRC); 4067 // If the lane index is 0, we just use a subregister COPY. 4068 if (LaneIdx == 0) { 4069 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 4070 .addReg(VecReg, 0, ExtractSubReg); 4071 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4072 return &*Copy; 4073 } 4074 4075 // Lane copies require 128-bit wide registers. If we're dealing with an 4076 // unpacked vector, then we need to move up to that width. Insert an implicit 4077 // def and a subregister insert to get us there. 4078 if (VecTy.getSizeInBits() != 128) { 4079 MachineInstr *ScalarToVector = emitScalarToVector( 4080 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 4081 if (!ScalarToVector) 4082 return nullptr; 4083 InsertReg = ScalarToVector->getOperand(0).getReg(); 4084 } 4085 4086 MachineInstr *LaneCopyMI = 4087 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 4088 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 4089 4090 // Make sure that we actually constrain the initial copy. 4091 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4092 return LaneCopyMI; 4093 } 4094 4095 bool AArch64InstructionSelector::selectExtractElt( 4096 MachineInstr &I, MachineRegisterInfo &MRI) { 4097 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 4098 "unexpected opcode!"); 4099 Register DstReg = I.getOperand(0).getReg(); 4100 const LLT NarrowTy = MRI.getType(DstReg); 4101 const Register SrcReg = I.getOperand(1).getReg(); 4102 const LLT WideTy = MRI.getType(SrcReg); 4103 (void)WideTy; 4104 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 4105 "source register size too small!"); 4106 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 4107 4108 // Need the lane index to determine the correct copy opcode. 4109 MachineOperand &LaneIdxOp = I.getOperand(2); 4110 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 4111 4112 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 4113 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 4114 return false; 4115 } 4116 4117 // Find the index to extract from. 
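// Only constant lane indices are handled here, e.g.
//   %c:gpr(s64) = G_CONSTANT i64 1
//   %e:fpr(s32) = G_EXTRACT_VECTOR_ELT %v:fpr(<4 x s32>), %c(s64)
// A non-constant index makes us bail out.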
4118 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 4119 if (!VRegAndVal) 4120 return false; 4121 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 4122 4123 4124 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 4125 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 4126 LaneIdx, MIB); 4127 if (!Extract) 4128 return false; 4129 4130 I.eraseFromParent(); 4131 return true; 4132 } 4133 4134 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 4135 MachineInstr &I, MachineRegisterInfo &MRI) { 4136 unsigned NumElts = I.getNumOperands() - 1; 4137 Register SrcReg = I.getOperand(NumElts).getReg(); 4138 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4139 const LLT SrcTy = MRI.getType(SrcReg); 4140 4141 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 4142 if (SrcTy.getSizeInBits() > 128) { 4143 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 4144 return false; 4145 } 4146 4147 // We implement a split vector operation by treating the sub-vectors as 4148 // scalars and extracting them. 4149 const RegisterBank &DstRB = 4150 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 4151 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4152 Register Dst = I.getOperand(OpIdx).getReg(); 4153 MachineInstr *Extract = 4154 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4155 if (!Extract) 4156 return false; 4157 } 4158 I.eraseFromParent(); 4159 return true; 4160 } 4161 4162 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4163 MachineRegisterInfo &MRI) { 4164 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4165 "unexpected opcode"); 4166 4167 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4168 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4169 AArch64::FPRRegBankID || 4170 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4171 AArch64::FPRRegBankID) { 4172 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4173 "currently unsupported.\n"); 4174 return false; 4175 } 4176 4177 // The last operand is the vector source register, and every other operand is 4178 // a register to unpack into. 4179 unsigned NumElts = I.getNumOperands() - 1; 4180 Register SrcReg = I.getOperand(NumElts).getReg(); 4181 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4182 const LLT WideTy = MRI.getType(SrcReg); 4183 (void)WideTy; 4184 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4185 "can only unmerge from vector or s128 types!"); 4186 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4187 "source register size too small!"); 4188 4189 if (!NarrowTy.isScalar()) 4190 return selectSplitVectorUnmerge(I, MRI); 4191 4192 // Choose a lane copy opcode and subregister based off of the size of the 4193 // vector's elements. 4194 unsigned CopyOpc = 0; 4195 unsigned ExtractSubReg = 0; 4196 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4197 return false; 4198 4199 // Set up for the lane copies. 4200 MachineBasicBlock &MBB = *I.getParent(); 4201 4202 // Stores the registers we'll be copying from. 4203 SmallVector<Register, 4> InsertRegs; 4204 4205 // We'll use the first register twice, so we only need NumElts-1 registers. 4206 unsigned NumInsertRegs = NumElts - 1; 4207 4208 // If our elements fit into exactly 128 bits, then we can copy from the source 4209 // directly. 
Otherwise, we need to do a bit of setup with some subregister 4210 // inserts. 4211 if (NarrowTy.getSizeInBits() * NumElts == 128) { 4212 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg); 4213 } else { 4214 // No. We have to perform subregister inserts. For each insert, create an 4215 // implicit def and a subregister insert, and save the register we create. 4216 const TargetRegisterClass *RC = getRegClassForTypeOnBank( 4217 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()), 4218 *RBI.getRegBank(SrcReg, MRI, TRI)); 4219 unsigned SubReg = 0; 4220 bool Found = getSubRegForClass(RC, TRI, SubReg); 4221 (void)Found; 4222 assert(Found && "expected to find last operand's subreg idx"); 4223 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { 4224 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4225 MachineInstr &ImpDefMI = 4226 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), 4227 ImpDefReg); 4228 4229 // Now, create the subregister insert from SrcReg. 4230 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4231 MachineInstr &InsMI = 4232 *BuildMI(MBB, I, I.getDebugLoc(), 4233 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) 4234 .addUse(ImpDefReg) 4235 .addUse(SrcReg) 4236 .addImm(SubReg); 4237 4238 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); 4239 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); 4240 4241 // Save the register so that we can copy from it after. 4242 InsertRegs.push_back(InsertReg); 4243 } 4244 } 4245 4246 // Now that we've created any necessary subregister inserts, we can 4247 // create the copies. 4248 // 4249 // Perform the first copy separately as a subregister copy. 4250 Register CopyTo = I.getOperand(0).getReg(); 4251 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) 4252 .addReg(InsertRegs[0], 0, ExtractSubReg); 4253 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); 4254 4255 // Now, perform the remaining copies as vector lane copies. 4256 unsigned LaneIdx = 1; 4257 for (Register InsReg : InsertRegs) { 4258 Register CopyTo = I.getOperand(LaneIdx).getReg(); 4259 MachineInstr &CopyInst = 4260 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) 4261 .addUse(InsReg) 4262 .addImm(LaneIdx); 4263 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); 4264 ++LaneIdx; 4265 } 4266 4267 // Separately constrain the first copy's destination. Because of the 4268 // limitation in constrainOperandRegClass, we can't guarantee that this will 4269 // actually be constrained. So, do it ourselves using the second operand.
4270 const TargetRegisterClass *RC = 4271 MRI.getRegClassOrNull(I.getOperand(1).getReg()); 4272 if (!RC) { 4273 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); 4274 return false; 4275 } 4276 4277 RBI.constrainGenericRegister(CopyTo, *RC, MRI); 4278 I.eraseFromParent(); 4279 return true; 4280 } 4281 4282 bool AArch64InstructionSelector::selectConcatVectors( 4283 MachineInstr &I, MachineRegisterInfo &MRI) { 4284 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && 4285 "Unexpected opcode"); 4286 Register Dst = I.getOperand(0).getReg(); 4287 Register Op1 = I.getOperand(1).getReg(); 4288 Register Op2 = I.getOperand(2).getReg(); 4289 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); 4290 if (!ConcatMI) 4291 return false; 4292 I.eraseFromParent(); 4293 return true; 4294 } 4295 4296 unsigned 4297 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, 4298 MachineFunction &MF) const { 4299 Type *CPTy = CPVal->getType(); 4300 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); 4301 4302 MachineConstantPool *MCP = MF.getConstantPool(); 4303 return MCP->getConstantPoolIndex(CPVal, Alignment); 4304 } 4305 4306 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( 4307 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { 4308 const TargetRegisterClass *RC; 4309 unsigned Opc; 4310 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny; 4311 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); 4312 switch (Size) { 4313 case 16: 4314 RC = &AArch64::FPR128RegClass; 4315 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui; 4316 break; 4317 case 8: 4318 RC = &AArch64::FPR64RegClass; 4319 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui; 4320 break; 4321 case 4: 4322 RC = &AArch64::FPR32RegClass; 4323 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui; 4324 break; 4325 case 2: 4326 RC = &AArch64::FPR16RegClass; 4327 Opc = AArch64::LDRHui; 4328 break; 4329 default: 4330 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " 4331 << *CPVal->getType()); 4332 return nullptr; 4333 } 4334 4335 MachineInstr *LoadMI = nullptr; 4336 auto &MF = MIRBuilder.getMF(); 4337 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); 4338 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) { 4339 // Use load(literal) for tiny code model. 4340 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx); 4341 } else { 4342 auto Adrp = 4343 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) 4344 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); 4345 4346 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp}) 4347 .addConstantPoolIndex( 4348 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4349 4350 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); 4351 } 4352 4353 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); 4354 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, 4355 MachineMemOperand::MOLoad, 4356 Size, Align(Size))); 4357 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); 4358 return LoadMI; 4359 } 4360 4361 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given 4362 /// size and RB. 
4363 static std::pair<unsigned, unsigned> 4364 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4365 unsigned Opc, SubregIdx; 4366 if (RB.getID() == AArch64::GPRRegBankID) { 4367 if (EltSize == 8) { 4368 Opc = AArch64::INSvi8gpr; 4369 SubregIdx = AArch64::bsub; 4370 } else if (EltSize == 16) { 4371 Opc = AArch64::INSvi16gpr; 4372 SubregIdx = AArch64::ssub; 4373 } else if (EltSize == 32) { 4374 Opc = AArch64::INSvi32gpr; 4375 SubregIdx = AArch64::ssub; 4376 } else if (EltSize == 64) { 4377 Opc = AArch64::INSvi64gpr; 4378 SubregIdx = AArch64::dsub; 4379 } else { 4380 llvm_unreachable("invalid elt size!"); 4381 } 4382 } else { 4383 if (EltSize == 8) { 4384 Opc = AArch64::INSvi8lane; 4385 SubregIdx = AArch64::bsub; 4386 } else if (EltSize == 16) { 4387 Opc = AArch64::INSvi16lane; 4388 SubregIdx = AArch64::hsub; 4389 } else if (EltSize == 32) { 4390 Opc = AArch64::INSvi32lane; 4391 SubregIdx = AArch64::ssub; 4392 } else if (EltSize == 64) { 4393 Opc = AArch64::INSvi64lane; 4394 SubregIdx = AArch64::dsub; 4395 } else { 4396 llvm_unreachable("invalid elt size!"); 4397 } 4398 } 4399 return std::make_pair(Opc, SubregIdx); 4400 } 4401 4402 MachineInstr *AArch64InstructionSelector::emitInstr( 4403 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4404 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4405 const ComplexRendererFns &RenderFns) const { 4406 assert(Opcode && "Expected an opcode?"); 4407 assert(!isPreISelGenericOpcode(Opcode) && 4408 "Function should only be used to produce selected instructions!"); 4409 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4410 if (RenderFns) 4411 for (auto &Fn : *RenderFns) 4412 Fn(MI); 4413 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4414 return &*MI; 4415 } 4416 4417 MachineInstr *AArch64InstructionSelector::emitAddSub( 4418 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4419 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4420 MachineIRBuilder &MIRBuilder) const { 4421 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4422 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4423 auto Ty = MRI.getType(LHS.getReg()); 4424 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4425 unsigned Size = Ty.getSizeInBits(); 4426 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4427 bool Is32Bit = Size == 32; 4428 4429 // INSTRri form with positive arithmetic immediate. 4430 if (auto Fns = selectArithImmed(RHS)) 4431 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4432 MIRBuilder, Fns); 4433 4434 // INSTRri form with negative arithmetic immediate. 4435 if (auto Fns = selectNegArithImmed(RHS)) 4436 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4437 MIRBuilder, Fns); 4438 4439 // INSTRrx form. 4440 if (auto Fns = selectArithExtendedRegister(RHS)) 4441 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4442 MIRBuilder, Fns); 4443 4444 // INSTRrs form. 
4445 if (auto Fns = selectShiftedRegister(RHS)) 4446 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4447 MIRBuilder, Fns); 4448 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4449 MIRBuilder); 4450 } 4451 4452 MachineInstr * 4453 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4454 MachineOperand &RHS, 4455 MachineIRBuilder &MIRBuilder) const { 4456 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4457 {{AArch64::ADDXri, AArch64::ADDWri}, 4458 {AArch64::ADDXrs, AArch64::ADDWrs}, 4459 {AArch64::ADDXrr, AArch64::ADDWrr}, 4460 {AArch64::SUBXri, AArch64::SUBWri}, 4461 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4462 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4463 } 4464 4465 MachineInstr * 4466 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4467 MachineOperand &RHS, 4468 MachineIRBuilder &MIRBuilder) const { 4469 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4470 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4471 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4472 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4473 {AArch64::SUBSXri, AArch64::SUBSWri}, 4474 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4475 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4476 } 4477 4478 MachineInstr * 4479 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4480 MachineOperand &RHS, 4481 MachineIRBuilder &MIRBuilder) const { 4482 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4483 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4484 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4485 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4486 {AArch64::ADDSXri, AArch64::ADDSWri}, 4487 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4488 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4489 } 4490 4491 MachineInstr * 4492 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS, 4493 MachineOperand &RHS, 4494 MachineIRBuilder &MIRBuilder) const { 4495 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4496 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4497 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4498 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr}; 4499 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4500 } 4501 4502 MachineInstr * 4503 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS, 4504 MachineOperand &RHS, 4505 MachineIRBuilder &MIRBuilder) const { 4506 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4507 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4508 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4509 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr}; 4510 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4511 } 4512 4513 MachineInstr * 4514 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4515 MachineIRBuilder &MIRBuilder) const { 4516 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4517 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4518 auto RC = Is32Bit ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4519 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4520 } 4521 4522 MachineInstr * 4523 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4524 MachineIRBuilder &MIRBuilder) const { 4525 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4526 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4527 LLT Ty = MRI.getType(LHS.getReg()); 4528 unsigned RegSize = Ty.getSizeInBits(); 4529 bool Is32Bit = (RegSize == 32); 4530 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4531 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4532 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4533 // ANDS needs a logical immediate for its immediate form. Check if we can 4534 // fold one in. 4535 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4536 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4537 4538 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4539 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4540 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4541 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4542 return &*TstMI; 4543 } 4544 } 4545 4546 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4547 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4548 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4549 } 4550 4551 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4552 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4553 MachineIRBuilder &MIRBuilder) const { 4554 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4555 assert(Predicate.isPredicate() && "Expected predicate?"); 4556 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4557 LLT CmpTy = MRI.getType(LHS.getReg()); 4558 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4559 unsigned Size = CmpTy.getSizeInBits(); 4560 (void)Size; 4561 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4562 // Fold the compare into a cmn or tst if possible. 
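// E.g. a compare whose RHS is a G_SUB from zero can typically be folded into
// a CMN (an ADDS that discards its result), and an equality compare of a
// G_AND against zero can typically be folded into a TST (an ANDS that
// discards its result).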
4563 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4564 return FoldCmp; 4565 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4566 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4567 } 4568 4569 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4570 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4571 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4572 #ifndef NDEBUG 4573 LLT Ty = MRI.getType(Dst); 4574 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4575 "Expected a 32-bit scalar register?"); 4576 #endif 4577 const Register ZReg = AArch64::WZR; 4578 AArch64CC::CondCode CC1, CC2; 4579 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4580 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4581 if (CC2 == AArch64CC::AL) 4582 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4583 MIRBuilder); 4584 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4585 Register Def1Reg = MRI.createVirtualRegister(RC); 4586 Register Def2Reg = MRI.createVirtualRegister(RC); 4587 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4588 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4589 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4590 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4591 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4592 return &*OrMI; 4593 } 4594 4595 MachineInstr *AArch64InstructionSelector::emitFPCompare( 4596 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, 4597 std::optional<CmpInst::Predicate> Pred) const { 4598 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4599 LLT Ty = MRI.getType(LHS); 4600 if (Ty.isVector()) 4601 return nullptr; 4602 unsigned OpSize = Ty.getSizeInBits(); 4603 if (OpSize != 32 && OpSize != 64) 4604 return nullptr; 4605 4606 // If this is a compare against +0.0, then we don't have 4607 // to explicitly materialize a constant. 4608 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); 4609 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); 4610 4611 auto IsEqualityPred = [](CmpInst::Predicate P) { 4612 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || 4613 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; 4614 }; 4615 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { 4616 // Try commutating the operands. 4617 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); 4618 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { 4619 ShouldUseImm = true; 4620 std::swap(LHS, RHS); 4621 } 4622 } 4623 unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, 4624 {AArch64::FCMPSri, AArch64::FCMPDri}}; 4625 unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; 4626 4627 // Partially build the compare. Decide if we need to add a use for the 4628 // third operand based off whether or not we're comparing against 0.0. 4629 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); 4630 CmpMI.setMIFlags(MachineInstr::NoFPExcept); 4631 if (!ShouldUseImm) 4632 CmpMI.addUse(RHS); 4633 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); 4634 return &*CmpMI; 4635 } 4636 4637 MachineInstr *AArch64InstructionSelector::emitVectorConcat( 4638 std::optional<Register> Dst, Register Op1, Register Op2, 4639 MachineIRBuilder &MIRBuilder) const { 4640 // We implement a vector concat by: 4641 // 1. Use scalar_to_vector to insert the lower vector into the larger dest 4642 // 2. 
Insert the upper vector into the destination's upper element 4643 // TODO: some of this code is common with G_BUILD_VECTOR handling. 4644 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4645 4646 const LLT Op1Ty = MRI.getType(Op1); 4647 const LLT Op2Ty = MRI.getType(Op2); 4648 4649 if (Op1Ty != Op2Ty) { 4650 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); 4651 return nullptr; 4652 } 4653 assert(Op1Ty.isVector() && "Expected a vector for vector concat"); 4654 4655 if (Op1Ty.getSizeInBits() >= 128) { 4656 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); 4657 return nullptr; 4658 } 4659 4660 // At the moment we just support 64 bit vector concats. 4661 if (Op1Ty.getSizeInBits() != 64) { 4662 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); 4663 return nullptr; 4664 } 4665 4666 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); 4667 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); 4668 const TargetRegisterClass *DstRC = 4669 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); 4670 4671 MachineInstr *WidenedOp1 = 4672 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); 4673 MachineInstr *WidenedOp2 = 4674 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); 4675 if (!WidenedOp1 || !WidenedOp2) { 4676 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); 4677 return nullptr; 4678 } 4679 4680 // Now do the insert of the upper element. 4681 unsigned InsertOpc, InsSubRegIdx; 4682 std::tie(InsertOpc, InsSubRegIdx) = 4683 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); 4684 4685 if (!Dst) 4686 Dst = MRI.createVirtualRegister(DstRC); 4687 auto InsElt = 4688 MIRBuilder 4689 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) 4690 .addImm(1) /* Lane index */ 4691 .addUse(WidenedOp2->getOperand(0).getReg()) 4692 .addImm(0); 4693 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 4694 return &*InsElt; 4695 } 4696 4697 MachineInstr * 4698 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, 4699 Register Src2, AArch64CC::CondCode Pred, 4700 MachineIRBuilder &MIRBuilder) const { 4701 auto &MRI = *MIRBuilder.getMRI(); 4702 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst); 4703 // If we used a register class, then this won't necessarily have an LLT. 4704 // Compute the size based off whether or not we have a class or bank. 4705 unsigned Size; 4706 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 4707 Size = TRI.getRegSizeInBits(*RC); 4708 else 4709 Size = MRI.getType(Dst).getSizeInBits(); 4710 // Some opcodes use s1. 4711 assert(Size <= 64 && "Expected 64 bits or less only!"); 4712 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr}; 4713 unsigned Opc = OpcTable[Size == 64]; 4714 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred); 4715 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI); 4716 return &*CSINC; 4717 } 4718 4719 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I, 4720 Register CarryReg) { 4721 MachineRegisterInfo *MRI = MIB.getMRI(); 4722 unsigned Opcode = I.getOpcode(); 4723 4724 // If the instruction is a SUB, we need to negate the carry, 4725 // because borrowing is indicated by carry-flag == 0. 
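// (AArch64's SBCS computes Rn - Rm - 1 + C, i.e. one extra is subtracted when
// the carry flag is clear, so an incoming borrow has to be presented as
// C == 0.)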
4726 bool NeedsNegatedCarry = 4727 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE); 4728 4729 // If the previous instruction will already produce the correct carry, do not 4730 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences 4731 // generated during legalization of wide add/sub. This optimization depends on 4732 // these sequences not being interrupted by other instructions. 4733 // We have to select the previous instruction before the carry-using 4734 // instruction is deleted by the calling function, otherwise the previous 4735 // instruction might become dead and would get deleted. 4736 MachineInstr *SrcMI = MRI->getVRegDef(CarryReg); 4737 if (SrcMI == I.getPrevNode()) { 4738 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) { 4739 bool ProducesNegatedCarry = CarrySrcMI->isSub(); 4740 if (NeedsNegatedCarry == ProducesNegatedCarry && 4741 CarrySrcMI->isUnsigned() && 4742 CarrySrcMI->getCarryOutReg() == CarryReg && 4743 selectAndRestoreState(*SrcMI)) 4744 return nullptr; 4745 } 4746 } 4747 4748 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass); 4749 4750 if (NeedsNegatedCarry) { 4751 // (0 - Carry) sets !C in NZCV when Carry == 1 4752 Register ZReg = AArch64::WZR; 4753 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB); 4754 } 4755 4756 // (Carry - 1) sets !C in NZCV when Carry == 0 4757 auto Fns = select12BitValueWithLeftShift(1); 4758 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns); 4759 } 4760 4761 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I, 4762 MachineRegisterInfo &MRI) { 4763 auto &CarryMI = cast<GAddSubCarryOut>(I); 4764 4765 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) { 4766 // Set NZCV carry according to carry-in VReg 4767 emitCarryIn(I, CarryInMI->getCarryInReg()); 4768 } 4769 4770 // Emit the operation and get the correct condition code. 4771 auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(), 4772 CarryMI.getLHS(), CarryMI.getRHS(), MIB); 4773 4774 Register CarryOutReg = CarryMI.getCarryOutReg(); 4775 4776 // Don't convert carry-out to VReg if it is never used 4777 if (!MRI.use_nodbg_empty(CarryOutReg)) { 4778 // Now, put the overflow result in the register given by the first operand 4779 // to the overflow op. CSINC increments the result when the predicate is 4780 // false, so to get the increment when it's true, we need to use the 4781 // inverse. In this case, we want to increment when carry is set. 
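// (E.g. for G_UADDO the overflow condition is HS, so we emit
// CSINC Wd, WZR, WZR, LO (the CSET idiom), which produces 1 exactly when the
// carry flag is set.)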
4782 Register ZReg = AArch64::WZR; 4783 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg, 4784 getInvertedCondCode(OpAndCC.second), MIB); 4785 } 4786 4787 I.eraseFromParent(); 4788 return true; 4789 } 4790 4791 std::pair<MachineInstr *, AArch64CC::CondCode> 4792 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, 4793 MachineOperand &LHS, 4794 MachineOperand &RHS, 4795 MachineIRBuilder &MIRBuilder) const { 4796 switch (Opcode) { 4797 default: 4798 llvm_unreachable("Unexpected opcode!"); 4799 case TargetOpcode::G_SADDO: 4800 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4801 case TargetOpcode::G_UADDO: 4802 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4803 case TargetOpcode::G_SSUBO: 4804 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4805 case TargetOpcode::G_USUBO: 4806 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4807 case TargetOpcode::G_SADDE: 4808 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4809 case TargetOpcode::G_UADDE: 4810 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4811 case TargetOpcode::G_SSUBE: 4812 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4813 case TargetOpcode::G_USUBE: 4814 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4815 } 4816 } 4817 4818 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be 4819 /// expressed as a conjunction. 4820 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 4821 /// changing the conditions on the CMP tests. 4822 /// (this means we can call emitConjunctionRec() with 4823 /// Negate==true on this sub-tree) 4824 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 4825 /// cannot do the negation naturally. We are required to 4826 /// emit the subtree first in this case. 4827 /// \param WillNegate Is true if we are called when the result of this 4828 /// subexpression must be negated. This happens when the 4829 /// outer expression is an OR. We can use this fact to know 4830 /// that we have a double negation (or (or ...) ...) that 4831 /// can be implemented for free. 4832 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, 4833 bool WillNegate, MachineRegisterInfo &MRI, 4834 unsigned Depth = 0) { 4835 if (!MRI.hasOneNonDBGUse(Val)) 4836 return false; 4837 MachineInstr *ValDef = MRI.getVRegDef(Val); 4838 unsigned Opcode = ValDef->getOpcode(); 4839 if (isa<GAnyCmp>(ValDef)) { 4840 CanNegate = true; 4841 MustBeFirst = false; 4842 return true; 4843 } 4844 // Protect against exponential runtime and stack overflow. 4845 if (Depth > 6) 4846 return false; 4847 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { 4848 bool IsOR = Opcode == TargetOpcode::G_OR; 4849 Register O0 = ValDef->getOperand(1).getReg(); 4850 Register O1 = ValDef->getOperand(2).getReg(); 4851 bool CanNegateL; 4852 bool MustBeFirstL; 4853 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) 4854 return false; 4855 bool CanNegateR; 4856 bool MustBeFirstR; 4857 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) 4858 return false; 4859 4860 if (MustBeFirstL && MustBeFirstR) 4861 return false; 4862 4863 if (IsOR) { 4864 // For an OR expression we need to be able to naturally negate at least 4865 // one side or we cannot do the transformation at all. 
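// (An OR is realized via De Morgan's law as !(!L && !R): the naturally
// negatable sub-tree is later swapped to the left and negated by inverting
// its compare predicates, while the other side is negated either the same way
// or afterwards by inverting its resulting condition code.)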
4866 if (!CanNegateL && !CanNegateR) 4867 return false; 4868 // If the result of the OR will be negated and we can naturally negate 4869 // the leaves, then this sub-tree as a whole negates naturally. 4870 CanNegate = WillNegate && CanNegateL && CanNegateR; 4871 // If we cannot naturally negate the whole sub-tree, then this must be 4872 // emitted first. 4873 MustBeFirst = !CanNegate; 4874 } else { 4875 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); 4876 // We cannot naturally negate an AND operation. 4877 CanNegate = false; 4878 MustBeFirst = MustBeFirstL || MustBeFirstR; 4879 } 4880 return true; 4881 } 4882 return false; 4883 } 4884 4885 MachineInstr *AArch64InstructionSelector::emitConditionalComparison( 4886 Register LHS, Register RHS, CmpInst::Predicate CC, 4887 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, 4888 MachineIRBuilder &MIB) const { 4889 // TODO: emit CMN as an optimization. 4890 auto &MRI = *MIB.getMRI(); 4891 LLT OpTy = MRI.getType(LHS); 4892 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); 4893 unsigned CCmpOpc; 4894 std::optional<ValueAndVReg> C; 4895 if (CmpInst::isIntPredicate(CC)) { 4896 C = getIConstantVRegValWithLookThrough(RHS, MRI); 4897 if (C && C->Value.ult(32)) 4898 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi; 4899 else 4900 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; 4901 } else { 4902 switch (OpTy.getSizeInBits()) { 4903 case 16: 4904 CCmpOpc = AArch64::FCCMPHrr; 4905 break; 4906 case 32: 4907 CCmpOpc = AArch64::FCCMPSrr; 4908 break; 4909 case 64: 4910 CCmpOpc = AArch64::FCCMPDrr; 4911 break; 4912 default: 4913 return nullptr; 4914 } 4915 } 4916 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 4917 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 4918 auto CCmp = 4919 MIB.buildInstr(CCmpOpc, {}, {LHS}); 4920 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi) 4921 CCmp.addImm(C->Value.getZExtValue()); 4922 else 4923 CCmp.addReg(RHS); 4924 CCmp.addImm(NZCV).addImm(Predicate); 4925 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); 4926 return &*CCmp; 4927 } 4928 4929 MachineInstr *AArch64InstructionSelector::emitConjunctionRec( 4930 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, 4931 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { 4932 // We're at a tree leaf, produce a conditional comparison operation. 4933 auto &MRI = *MIB.getMRI(); 4934 MachineInstr *ValDef = MRI.getVRegDef(Val); 4935 unsigned Opcode = ValDef->getOpcode(); 4936 if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { 4937 Register LHS = Cmp->getLHSReg(); 4938 Register RHS = Cmp->getRHSReg(); 4939 CmpInst::Predicate CC = Cmp->getCond(); 4940 if (Negate) 4941 CC = CmpInst::getInversePredicate(CC); 4942 if (isa<GICmp>(Cmp)) { 4943 OutCC = changeICMPPredToAArch64CC(CC); 4944 } else { 4945 // Handle special FP cases. 4946 AArch64CC::CondCode ExtraCC; 4947 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 4948 // Some floating point conditions can't be tested with a single condition 4949 // code. Construct an additional comparison in this case. 
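// (E.g. FCMP_ONE and FCMP_UEQ each decompose into two AArch64 condition
// codes; the extra compare is chained in below and its condition code becomes
// the predicate for the next conditional compare.)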
4950 if (ExtraCC != AArch64CC::AL) { 4951 MachineInstr *ExtraCmp; 4952 if (!CCOp) 4953 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); 4954 else 4955 ExtraCmp = 4956 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); 4957 CCOp = ExtraCmp->getOperand(0).getReg(); 4958 Predicate = ExtraCC; 4959 } 4960 } 4961 4962 // Produce a normal comparison if we are first in the chain 4963 if (!CCOp) { 4964 auto Dst = MRI.cloneVirtualRegister(LHS); 4965 if (isa<GICmp>(Cmp)) 4966 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); 4967 return emitFPCompare(Cmp->getOperand(2).getReg(), 4968 Cmp->getOperand(3).getReg(), MIB); 4969 } 4970 // Otherwise produce a ccmp. 4971 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); 4972 } 4973 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); 4974 4975 bool IsOR = Opcode == TargetOpcode::G_OR; 4976 4977 Register LHS = ValDef->getOperand(1).getReg(); 4978 bool CanNegateL; 4979 bool MustBeFirstL; 4980 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); 4981 assert(ValidL && "Valid conjunction/disjunction tree"); 4982 (void)ValidL; 4983 4984 Register RHS = ValDef->getOperand(2).getReg(); 4985 bool CanNegateR; 4986 bool MustBeFirstR; 4987 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); 4988 assert(ValidR && "Valid conjunction/disjunction tree"); 4989 (void)ValidR; 4990 4991 // Swap sub-tree that must come first to the right side. 4992 if (MustBeFirstL) { 4993 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 4994 std::swap(LHS, RHS); 4995 std::swap(CanNegateL, CanNegateR); 4996 std::swap(MustBeFirstL, MustBeFirstR); 4997 } 4998 4999 bool NegateR; 5000 bool NegateAfterR; 5001 bool NegateL; 5002 bool NegateAfterAll; 5003 if (Opcode == TargetOpcode::G_OR) { 5004 // Swap the sub-tree that we can negate naturally to the left. 5005 if (!CanNegateL) { 5006 assert(CanNegateR && "at least one side must be negatable"); 5007 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 5008 assert(!Negate); 5009 std::swap(LHS, RHS); 5010 NegateR = false; 5011 NegateAfterR = true; 5012 } else { 5013 // Negate the left sub-tree if possible, otherwise negate the result. 5014 NegateR = CanNegateR; 5015 NegateAfterR = !CanNegateR; 5016 } 5017 NegateL = true; 5018 NegateAfterAll = !Negate; 5019 } else { 5020 assert(Opcode == TargetOpcode::G_AND && 5021 "Valid conjunction/disjunction tree"); 5022 assert(!Negate && "Valid conjunction/disjunction tree"); 5023 5024 NegateL = false; 5025 NegateR = false; 5026 NegateAfterR = false; 5027 NegateAfterAll = false; 5028 } 5029 5030 // Emit sub-trees. 
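// (The right-hand sub-tree is emitted first; its condition code, possibly
// inverted, becomes the predicate under which the left-hand chain's
// conditional compares are evaluated.)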
5031 AArch64CC::CondCode RHSCC; 5032 MachineInstr *CmpR = 5033 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); 5034 if (NegateAfterR) 5035 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 5036 MachineInstr *CmpL = emitConjunctionRec( 5037 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); 5038 if (NegateAfterAll) 5039 OutCC = AArch64CC::getInvertedCondCode(OutCC); 5040 return CmpL; 5041 } 5042 5043 MachineInstr *AArch64InstructionSelector::emitConjunction( 5044 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { 5045 bool DummyCanNegate; 5046 bool DummyMustBeFirst; 5047 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, 5048 *MIB.getMRI())) 5049 return nullptr; 5050 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); 5051 } 5052 5053 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, 5054 MachineInstr &CondMI) { 5055 AArch64CC::CondCode AArch64CC; 5056 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); 5057 if (!ConjMI) 5058 return false; 5059 5060 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); 5061 SelI.eraseFromParent(); 5062 return true; 5063 } 5064 5065 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { 5066 MachineRegisterInfo &MRI = *MIB.getMRI(); 5067 // We want to recognize this pattern: 5068 // 5069 // $z = G_FCMP pred, $x, $y 5070 // ... 5071 // $w = G_SELECT $z, $a, $b 5072 // 5073 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 5074 // some copies/truncs in between.) 5075 // 5076 // If we see this, then we can emit something like this: 5077 // 5078 // fcmp $x, $y 5079 // fcsel $w, $a, $b, pred 5080 // 5081 // Rather than emitting both of the rather long sequences in the standard 5082 // G_FCMP/G_SELECT select methods. 5083 5084 // First, check if the condition is defined by a compare. 5085 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 5086 5087 // We can only fold if all of the defs have one use. 5088 Register CondDefReg = CondDef->getOperand(0).getReg(); 5089 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 5090 // Unless it's another select. 5091 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 5092 if (CondDef == &UI) 5093 continue; 5094 if (UI.getOpcode() != TargetOpcode::G_SELECT) 5095 return false; 5096 } 5097 } 5098 5099 // Is the condition defined by a compare? 5100 unsigned CondOpc = CondDef->getOpcode(); 5101 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { 5102 if (tryOptSelectConjunction(I, *CondDef)) 5103 return true; 5104 return false; 5105 } 5106 5107 AArch64CC::CondCode CondCode; 5108 if (CondOpc == TargetOpcode::G_ICMP) { 5109 auto Pred = 5110 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5111 CondCode = changeICMPPredToAArch64CC(Pred); 5112 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 5113 CondDef->getOperand(1), MIB); 5114 } else { 5115 // Get the condition code for the select. 5116 auto Pred = 5117 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5118 AArch64CC::CondCode CondCode2; 5119 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 5120 5121 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 5122 // instructions to emit the comparison. 5123 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 5124 // unnecessary. 
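// (E.g. FCMP_OGT maps to the single condition GT, whereas FCMP_ONE needs two
// conditions (MI or GT) and hence a second select, so it is rejected here.)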
5125 if (CondCode2 != AArch64CC::AL) 5126 return false; 5127 5128 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 5129 CondDef->getOperand(3).getReg(), MIB)) { 5130 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 5131 return false; 5132 } 5133 } 5134 5135 // Emit the select. 5136 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 5137 I.getOperand(3).getReg(), CondCode, MIB); 5138 I.eraseFromParent(); 5139 return true; 5140 } 5141 5142 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 5143 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 5144 MachineIRBuilder &MIRBuilder) const { 5145 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 5146 "Unexpected MachineOperand"); 5147 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5148 // We want to find this sort of thing: 5149 // x = G_SUB 0, y 5150 // G_ICMP z, x 5151 // 5152 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 5153 // e.g: 5154 // 5155 // cmn z, y 5156 5157 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 5158 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 5159 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 5160 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 5161 // Given this: 5162 // 5163 // x = G_SUB 0, y 5164 // G_ICMP x, z 5165 // 5166 // Produce this: 5167 // 5168 // cmn y, z 5169 if (isCMN(LHSDef, P, MRI)) 5170 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 5171 5172 // Same idea here, but with the RHS of the compare instead: 5173 // 5174 // Given this: 5175 // 5176 // x = G_SUB 0, y 5177 // G_ICMP z, x 5178 // 5179 // Produce this: 5180 // 5181 // cmn z, y 5182 if (isCMN(RHSDef, P, MRI)) 5183 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 5184 5185 // Given this: 5186 // 5187 // z = G_AND x, y 5188 // G_ICMP z, 0 5189 // 5190 // Produce this if the compare is signed: 5191 // 5192 // tst x, y 5193 if (!CmpInst::isUnsigned(P) && LHSDef && 5194 LHSDef->getOpcode() == TargetOpcode::G_AND) { 5195 // Make sure that the RHS is 0. 5196 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5197 if (!ValAndVReg || ValAndVReg->Value != 0) 5198 return nullptr; 5199 5200 return emitTST(LHSDef->getOperand(1), 5201 LHSDef->getOperand(2), MIRBuilder); 5202 } 5203 5204 return nullptr; 5205 } 5206 5207 bool AArch64InstructionSelector::selectShuffleVector( 5208 MachineInstr &I, MachineRegisterInfo &MRI) { 5209 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5210 Register Src1Reg = I.getOperand(1).getReg(); 5211 const LLT Src1Ty = MRI.getType(Src1Reg); 5212 Register Src2Reg = I.getOperand(2).getReg(); 5213 const LLT Src2Ty = MRI.getType(Src2Reg); 5214 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 5215 5216 MachineBasicBlock &MBB = *I.getParent(); 5217 MachineFunction &MF = *MBB.getParent(); 5218 LLVMContext &Ctx = MF.getFunction().getContext(); 5219 5220 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 5221 // it's originated from a <1 x T> type. Those should have been lowered into 5222 // G_BUILD_VECTOR earlier. 5223 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 5224 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 5225 return false; 5226 } 5227 5228 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 5229 5230 SmallVector<Constant *, 64> CstIdxs; 5231 for (int Val : Mask) { 5232 // For now, any undef indexes we'll just assume to be 0. 
This should be 5233 // optimized in future, e.g. to select DUP etc. 5234 Val = Val < 0 ? 0 : Val; 5235 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5236 unsigned Offset = Byte + Val * BytesPerElt; 5237 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 5238 } 5239 } 5240 5241 // Use a constant pool to load the index vector for TBL. 5242 Constant *CPVal = ConstantVector::get(CstIdxs); 5243 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 5244 if (!IndexLoad) { 5245 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 5246 return false; 5247 } 5248 5249 if (DstTy.getSizeInBits() != 128) { 5250 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 5251 // This case can be done with TBL1. 5252 MachineInstr *Concat = 5253 emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB); 5254 if (!Concat) { 5255 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 5256 return false; 5257 } 5258 5259 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 5260 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 5261 IndexLoad->getOperand(0).getReg(), MIB); 5262 5263 auto TBL1 = MIB.buildInstr( 5264 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 5265 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 5266 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 5267 5268 auto Copy = 5269 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 5270 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 5271 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 5272 I.eraseFromParent(); 5273 return true; 5274 } 5275 5276 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 5277 // Q registers for regalloc. 5278 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 5279 auto RegSeq = createQTuple(Regs, MIB); 5280 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 5281 {RegSeq, IndexLoad->getOperand(0)}); 5282 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 5283 I.eraseFromParent(); 5284 return true; 5285 } 5286 5287 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 5288 std::optional<Register> DstReg, Register SrcReg, Register EltReg, 5289 unsigned LaneIdx, const RegisterBank &RB, 5290 MachineIRBuilder &MIRBuilder) const { 5291 MachineInstr *InsElt = nullptr; 5292 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5293 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5294 5295 // Create a register to define with the insert if one wasn't passed in. 
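// (The register is always created in FPR128, so the insert produces a full
// 128-bit vector; callers such as selectInsertElt narrow the result back down
// afterwards when a smaller vector type is wanted.)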
5296 if (!DstReg) 5297 DstReg = MRI.createVirtualRegister(DstRC); 5298 5299 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 5300 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 5301 5302 if (RB.getID() == AArch64::FPRRegBankID) { 5303 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 5304 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5305 .addImm(LaneIdx) 5306 .addUse(InsSub->getOperand(0).getReg()) 5307 .addImm(0); 5308 } else { 5309 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5310 .addImm(LaneIdx) 5311 .addUse(EltReg); 5312 } 5313 5314 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 5315 return InsElt; 5316 } 5317 5318 bool AArch64InstructionSelector::selectUSMovFromExtend( 5319 MachineInstr &MI, MachineRegisterInfo &MRI) { 5320 if (MI.getOpcode() != TargetOpcode::G_SEXT && 5321 MI.getOpcode() != TargetOpcode::G_ZEXT && 5322 MI.getOpcode() != TargetOpcode::G_ANYEXT) 5323 return false; 5324 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 5325 const Register DefReg = MI.getOperand(0).getReg(); 5326 const LLT DstTy = MRI.getType(DefReg); 5327 unsigned DstSize = DstTy.getSizeInBits(); 5328 5329 if (DstSize != 32 && DstSize != 64) 5330 return false; 5331 5332 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 5333 MI.getOperand(1).getReg(), MRI); 5334 int64_t Lane; 5335 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 5336 return false; 5337 Register Src0 = Extract->getOperand(1).getReg(); 5338 5339 const LLT &VecTy = MRI.getType(Src0); 5340 5341 if (VecTy.getSizeInBits() != 128) { 5342 const MachineInstr *ScalarToVector = emitScalarToVector( 5343 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 5344 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 5345 Src0 = ScalarToVector->getOperand(0).getReg(); 5346 } 5347 5348 unsigned Opcode; 5349 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 5350 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 5351 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 5352 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 5353 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 5354 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 5355 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 5356 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 5357 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 5358 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 5359 else 5360 llvm_unreachable("Unexpected type combo for S/UMov!"); 5361 5362 // We may need to generate one of these, depending on the type and sign of the 5363 // input: 5364 // DstReg = SMOV Src0, Lane; 5365 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 5366 MachineInstr *ExtI = nullptr; 5367 if (DstSize == 64 && !IsSigned) { 5368 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 5369 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 5370 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 5371 .addImm(0) 5372 .addUse(NewReg) 5373 .addImm(AArch64::sub_32); 5374 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 5375 } else 5376 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 5377 5378 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 5379 MI.eraseFromParent(); 5380 return true; 5381 } 5382 5383 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, 5384 MachineRegisterInfo &MRI) { 5385 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT); 5386 5387 // Get information on the destination. 5388 Register DstReg = I.getOperand(0).getReg(); 5389 const LLT DstTy = MRI.getType(DstReg); 5390 unsigned VecSize = DstTy.getSizeInBits(); 5391 5392 // Get information on the element we want to insert into the destination. 5393 Register EltReg = I.getOperand(2).getReg(); 5394 const LLT EltTy = MRI.getType(EltReg); 5395 unsigned EltSize = EltTy.getSizeInBits(); 5396 if (EltSize < 8 || EltSize > 64) 5397 return false; 5398 5399 // Find the definition of the index. Bail out if it's not defined by a 5400 // G_CONSTANT. 5401 Register IdxReg = I.getOperand(3).getReg(); 5402 auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI); 5403 if (!VRegAndVal) 5404 return false; 5405 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 5406 5407 // Perform the lane insert. 5408 Register SrcReg = I.getOperand(1).getReg(); 5409 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5410 5411 if (VecSize < 128) { 5412 // If the vector we're inserting into is smaller than 128 bits, widen it 5413 // to 128 to do the insert. 5414 MachineInstr *ScalarToVec = 5415 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB); 5416 if (!ScalarToVec) 5417 return false; 5418 SrcReg = ScalarToVec->getOperand(0).getReg(); 5419 } 5420 5421 // Create an insert into a new FPR128 register. 5422 // Note that if our vector is already 128 bits, we end up emitting an extra 5423 // register. 5424 MachineInstr *InsMI = 5425 emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB); 5426 5427 if (VecSize < 128) { 5428 // If we had to widen to perform the insert, then we have to demote back to 5429 // the original size to get the result we want. 5430 if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI)) 5431 return false; 5432 } else { 5433 // No widening needed. 
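// The 128-bit insert already produces the value we want, so just retarget it
// to define the original destination register.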
5434 InsMI->getOperand(0).setReg(DstReg); 5435 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 5436 } 5437 5438 I.eraseFromParent(); 5439 return true; 5440 } 5441 5442 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8( 5443 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5444 unsigned int Op; 5445 if (DstSize == 128) { 5446 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5447 return nullptr; 5448 Op = AArch64::MOVIv16b_ns; 5449 } else { 5450 Op = AArch64::MOVIv8b_ns; 5451 } 5452 5453 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5454 5455 if (AArch64_AM::isAdvSIMDModImmType9(Val)) { 5456 Val = AArch64_AM::encodeAdvSIMDModImmType9(Val); 5457 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5458 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5459 return &*Mov; 5460 } 5461 return nullptr; 5462 } 5463 5464 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16( 5465 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5466 bool Inv) { 5467 5468 unsigned int Op; 5469 if (DstSize == 128) { 5470 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5471 return nullptr; 5472 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16; 5473 } else { 5474 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16; 5475 } 5476 5477 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5478 uint64_t Shift; 5479 5480 if (AArch64_AM::isAdvSIMDModImmType5(Val)) { 5481 Val = AArch64_AM::encodeAdvSIMDModImmType5(Val); 5482 Shift = 0; 5483 } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) { 5484 Val = AArch64_AM::encodeAdvSIMDModImmType6(Val); 5485 Shift = 8; 5486 } else 5487 return nullptr; 5488 5489 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5490 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5491 return &*Mov; 5492 } 5493 5494 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32( 5495 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5496 bool Inv) { 5497 5498 unsigned int Op; 5499 if (DstSize == 128) { 5500 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5501 return nullptr; 5502 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32; 5503 } else { 5504 Op = Inv ? 
AArch64::MVNIv2i32 : AArch64::MOVIv2i32; 5505 } 5506 5507 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5508 uint64_t Shift; 5509 5510 if ((AArch64_AM::isAdvSIMDModImmType1(Val))) { 5511 Val = AArch64_AM::encodeAdvSIMDModImmType1(Val); 5512 Shift = 0; 5513 } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) { 5514 Val = AArch64_AM::encodeAdvSIMDModImmType2(Val); 5515 Shift = 8; 5516 } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) { 5517 Val = AArch64_AM::encodeAdvSIMDModImmType3(Val); 5518 Shift = 16; 5519 } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) { 5520 Val = AArch64_AM::encodeAdvSIMDModImmType4(Val); 5521 Shift = 24; 5522 } else 5523 return nullptr; 5524 5525 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5526 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5527 return &*Mov; 5528 } 5529 5530 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64( 5531 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5532 5533 unsigned int Op; 5534 if (DstSize == 128) { 5535 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5536 return nullptr; 5537 Op = AArch64::MOVIv2d_ns; 5538 } else { 5539 Op = AArch64::MOVID; 5540 } 5541 5542 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5543 if (AArch64_AM::isAdvSIMDModImmType10(Val)) { 5544 Val = AArch64_AM::encodeAdvSIMDModImmType10(Val); 5545 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5546 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5547 return &*Mov; 5548 } 5549 return nullptr; 5550 } 5551 5552 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s( 5553 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5554 bool Inv) { 5555 5556 unsigned int Op; 5557 if (DstSize == 128) { 5558 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5559 return nullptr; 5560 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl; 5561 } else { 5562 Op = Inv ? 
AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl; 5563 } 5564 5565 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5566 uint64_t Shift; 5567 5568 if (AArch64_AM::isAdvSIMDModImmType7(Val)) { 5569 Val = AArch64_AM::encodeAdvSIMDModImmType7(Val); 5570 Shift = 264; 5571 } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) { 5572 Val = AArch64_AM::encodeAdvSIMDModImmType8(Val); 5573 Shift = 272; 5574 } else 5575 return nullptr; 5576 5577 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5578 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5579 return &*Mov; 5580 } 5581 5582 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP( 5583 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5584 5585 unsigned int Op; 5586 bool IsWide = false; 5587 if (DstSize == 128) { 5588 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5589 return nullptr; 5590 Op = AArch64::FMOVv4f32_ns; 5591 IsWide = true; 5592 } else { 5593 Op = AArch64::FMOVv2f32_ns; 5594 } 5595 5596 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5597 5598 if (AArch64_AM::isAdvSIMDModImmType11(Val)) { 5599 Val = AArch64_AM::encodeAdvSIMDModImmType11(Val); 5600 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) { 5601 Val = AArch64_AM::encodeAdvSIMDModImmType12(Val); 5602 Op = AArch64::FMOVv2f64_ns; 5603 } else 5604 return nullptr; 5605 5606 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5607 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5608 return &*Mov; 5609 } 5610 5611 bool AArch64InstructionSelector::selectIndexedExtLoad( 5612 MachineInstr &MI, MachineRegisterInfo &MRI) { 5613 auto &ExtLd = cast<GIndexedAnyExtLoad>(MI); 5614 Register Dst = ExtLd.getDstReg(); 5615 Register WriteBack = ExtLd.getWritebackReg(); 5616 Register Base = ExtLd.getBaseReg(); 5617 Register Offset = ExtLd.getOffsetReg(); 5618 LLT Ty = MRI.getType(Dst); 5619 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs. 5620 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits(); 5621 bool IsPre = ExtLd.isPre(); 5622 bool IsSExt = isa<GIndexedSExtLoad>(ExtLd); 5623 bool InsertIntoXReg = false; 5624 bool IsDst64 = Ty.getSizeInBits() == 64; 5625 5626 unsigned Opc = 0; 5627 LLT NewLdDstTy; 5628 LLT s32 = LLT::scalar(32); 5629 LLT s64 = LLT::scalar(64); 5630 5631 if (MemSizeBits == 8) { 5632 if (IsSExt) { 5633 if (IsDst64) 5634 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; 5635 else 5636 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; 5637 NewLdDstTy = IsDst64 ? s64 : s32; 5638 } else { 5639 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; 5640 InsertIntoXReg = IsDst64; 5641 NewLdDstTy = s32; 5642 } 5643 } else if (MemSizeBits == 16) { 5644 if (IsSExt) { 5645 if (IsDst64) 5646 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; 5647 else 5648 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; 5649 NewLdDstTy = IsDst64 ? s64 : s32; 5650 } else { 5651 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; 5652 InsertIntoXReg = IsDst64; 5653 NewLdDstTy = s32; 5654 } 5655 } else if (MemSizeBits == 32) { 5656 if (IsSExt) { 5657 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; 5658 NewLdDstTy = s64; 5659 } else { 5660 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; 5661 InsertIntoXReg = IsDst64; 5662 NewLdDstTy = s32; 5663 } 5664 } else { 5665 llvm_unreachable("Unexpected size for indexed load"); 5666 } 5667 5668 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5669 return false; // We should be on gpr. 
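// The pre/post-indexed addressing modes take a signed immediate offset, so
// the offset operand must be a known constant.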
5670 5671 auto Cst = getIConstantVRegVal(Offset, MRI); 5672 if (!Cst) 5673 return false; // Shouldn't happen, but just in case. 5674 5675 auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base}) 5676 .addImm(Cst->getSExtValue()); 5677 LdMI.cloneMemRefs(ExtLd); 5678 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5679 // Make sure to select the load with the MemTy as the dest type, and then 5680 // insert into X reg if needed. 5681 if (InsertIntoXReg) { 5682 // Generate a SUBREG_TO_REG. 5683 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {}) 5684 .addImm(0) 5685 .addUse(LdMI.getReg(1)) 5686 .addImm(AArch64::sub_32); 5687 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass, 5688 MRI); 5689 } else { 5690 auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1)); 5691 selectCopy(*Copy, TII, MRI, TRI, RBI); 5692 } 5693 MI.eraseFromParent(); 5694 5695 return true; 5696 } 5697 5698 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI, 5699 MachineRegisterInfo &MRI) { 5700 auto &Ld = cast<GIndexedLoad>(MI); 5701 Register Dst = Ld.getDstReg(); 5702 Register WriteBack = Ld.getWritebackReg(); 5703 Register Base = Ld.getBaseReg(); 5704 Register Offset = Ld.getOffsetReg(); 5705 assert(MRI.getType(Dst).getSizeInBits() <= 128 && 5706 "Unexpected type for indexed load"); 5707 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes(); 5708 5709 if (MemSize < MRI.getType(Dst).getSizeInBytes()) 5710 return selectIndexedExtLoad(MI, MRI); 5711 5712 unsigned Opc = 0; 5713 if (Ld.isPre()) { 5714 static constexpr unsigned GPROpcodes[] = { 5715 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre, 5716 AArch64::LDRXpre}; 5717 static constexpr unsigned FPROpcodes[] = { 5718 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre, 5719 AArch64::LDRQpre}; 5720 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5721 Opc = FPROpcodes[Log2_32(MemSize)]; 5722 else 5723 Opc = GPROpcodes[Log2_32(MemSize)]; 5724 } else { 5725 static constexpr unsigned GPROpcodes[] = { 5726 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost, 5727 AArch64::LDRXpost}; 5728 static constexpr unsigned FPROpcodes[] = { 5729 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost, 5730 AArch64::LDRDpost, AArch64::LDRQpost}; 5731 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5732 Opc = FPROpcodes[Log2_32(MemSize)]; 5733 else 5734 Opc = GPROpcodes[Log2_32(MemSize)]; 5735 } 5736 auto Cst = getIConstantVRegVal(Offset, MRI); 5737 if (!Cst) 5738 return false; // Shouldn't happen, but just in case. 
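// Indexed loads define two results: the updated (written-back) base register
// first and the loaded value second.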
5739 auto LdMI = 5740 MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue()); 5741 LdMI.cloneMemRefs(Ld); 5742 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5743 MI.eraseFromParent(); 5744 return true; 5745 } 5746 5747 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I, 5748 MachineRegisterInfo &MRI) { 5749 Register Dst = I.getWritebackReg(); 5750 Register Val = I.getValueReg(); 5751 Register Base = I.getBaseReg(); 5752 Register Offset = I.getOffsetReg(); 5753 LLT ValTy = MRI.getType(Val); 5754 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store"); 5755 5756 unsigned Opc = 0; 5757 if (I.isPre()) { 5758 static constexpr unsigned GPROpcodes[] = { 5759 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre, 5760 AArch64::STRXpre}; 5761 static constexpr unsigned FPROpcodes[] = { 5762 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre, 5763 AArch64::STRQpre}; 5764 5765 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5766 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5767 else 5768 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5769 } else { 5770 static constexpr unsigned GPROpcodes[] = { 5771 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost, 5772 AArch64::STRXpost}; 5773 static constexpr unsigned FPROpcodes[] = { 5774 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost, 5775 AArch64::STRDpost, AArch64::STRQpost}; 5776 5777 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5778 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5779 else 5780 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5781 } 5782 5783 auto Cst = getIConstantVRegVal(Offset, MRI); 5784 if (!Cst) 5785 return false; // Shouldn't happen, but just in case. 
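// Unlike the indexed loads above, an indexed store defines only the
// written-back base register; the stored value and the base are plain uses.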
5786 auto Str = 5787 MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue()); 5788 Str.cloneMemRefs(I); 5789 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI); 5790 I.eraseFromParent(); 5791 return true; 5792 } 5793 5794 MachineInstr * 5795 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5796 MachineIRBuilder &MIRBuilder, 5797 MachineRegisterInfo &MRI) { 5798 LLT DstTy = MRI.getType(Dst); 5799 unsigned DstSize = DstTy.getSizeInBits(); 5800 if (CV->isNullValue()) { 5801 if (DstSize == 128) { 5802 auto Mov = 5803 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5804 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5805 return &*Mov; 5806 } 5807 5808 if (DstSize == 64) { 5809 auto Mov = 5810 MIRBuilder 5811 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5812 .addImm(0); 5813 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5814 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5815 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5816 return &*Copy; 5817 } 5818 } 5819 5820 if (CV->getSplatValue()) { 5821 APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger()); 5822 MachineInstr *NewOp; 5823 bool Inv = false; 5824 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) || 5825 (NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5826 (NewOp = 5827 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5828 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5829 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) || 5830 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder))) 5831 return NewOp; 5832 5833 DefBits = ~DefBits; 5834 Inv = true; 5835 if ((NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5836 (NewOp = 5837 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5838 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv))) 5839 return NewOp; 5840 } 5841 5842 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5843 if (!CPLoad) { 5844 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5845 return nullptr; 5846 } 5847 5848 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5849 RBI.constrainGenericRegister( 5850 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5851 return &*Copy; 5852 } 5853 5854 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5855 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5856 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5857 unsigned DstSize = DstTy.getSizeInBits(); 5858 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5859 if (DstSize < 32) 5860 return false; 5861 // Check if we're building a constant vector, in which case we want to 5862 // generate a constant pool load instead of a vector insert sequence. 
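// (emitConstantVector still prefers a MOVI/MVNI/FMOV modified-immediate or a
// zero idiom when the splat allows it; the constant pool load is its
// fallback.)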
5863 SmallVector<Constant *, 16> Csts; 5864 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5865 // Try to find G_CONSTANT or G_FCONSTANT 5866 auto *OpMI = 5867 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5868 if (OpMI) 5869 Csts.emplace_back( 5870 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5871 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5872 I.getOperand(Idx).getReg(), MRI))) 5873 Csts.emplace_back( 5874 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5875 else 5876 return false; 5877 } 5878 Constant *CV = ConstantVector::get(Csts); 5879 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5880 return false; 5881 I.eraseFromParent(); 5882 return true; 5883 } 5884 5885 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5886 MachineInstr &I, MachineRegisterInfo &MRI) { 5887 // Given: 5888 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5889 // 5890 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5891 Register Dst = I.getOperand(0).getReg(); 5892 Register EltReg = I.getOperand(1).getReg(); 5893 LLT EltTy = MRI.getType(EltReg); 5894 // If the destination vector isn't on the same bank as its elements, then 5895 // this can't be a SUBREG_TO_REG. 5896 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5897 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5898 if (EltRB != DstRB) 5899 return false; 5900 if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) { 5901 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI); 5902 })) 5903 return false; 5904 unsigned SubReg; 5905 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); 5906 if (!EltRC) 5907 return false; 5908 const TargetRegisterClass *DstRC = 5909 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); 5910 if (!DstRC) 5911 return false; 5912 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5913 return false; 5914 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5915 .addImm(0) 5916 .addUse(EltReg) 5917 .addImm(SubReg); 5918 I.eraseFromParent(); 5919 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5920 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5921 } 5922 5923 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5924 MachineRegisterInfo &MRI) { 5925 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5926 // Until we port more of the optimized selections, for now just use a vector 5927 // insert sequence. 5928 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5929 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5930 unsigned EltSize = EltTy.getSizeInBits(); 5931 5932 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5933 return true; 5934 if (tryOptBuildVecToSubregToReg(I, MRI)) 5935 return true; 5936 5937 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64) 5938 return false; // Don't support all element types yet. 5939 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5940 5941 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5942 MachineInstr *ScalarToVec = 5943 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5944 I.getOperand(1).getReg(), MIB); 5945 if (!ScalarToVec) 5946 return false; 5947 5948 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5949 unsigned DstSize = DstTy.getSizeInBits(); 5950 5951 // Keep track of the last MI we inserted. Later on, we might be able to save 5952 // a copy using it. 
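// (Each iteration below inserts operand i into lane i - 1 of a fresh 128-bit
// register; when the build vector is already 128 bits wide, the final insert
// is simply retargeted at the destination instead of copying.)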
5953 MachineInstr *PrevMI = nullptr; 5954 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5955 // Note that if we don't do a subregister copy, we can end up making an 5956 // extra register. 5957 PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(), 5958 i - 1, RB, MIB); 5959 DstVec = PrevMI->getOperand(0).getReg(); 5960 } 5961 5962 // If DstTy's size in bits is less than 128, then emit a subregister copy 5963 // from DstVec to the last register we've defined. 5964 if (DstSize < 128) { 5965 // Force this to be FPR using the destination vector. 5966 const TargetRegisterClass *RC = 5967 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5968 if (!RC) 5969 return false; 5970 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5971 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5972 return false; 5973 } 5974 5975 unsigned SubReg = 0; 5976 if (!getSubRegForClass(RC, TRI, SubReg)) 5977 return false; 5978 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5979 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize 5980 << "\n"); 5981 return false; 5982 } 5983 5984 Register Reg = MRI.createVirtualRegister(RC); 5985 Register DstReg = I.getOperand(0).getReg(); 5986 5987 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5988 MachineOperand &RegOp = I.getOperand(1); 5989 RegOp.setReg(Reg); 5990 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5991 } else { 5992 // We don't need a subregister copy. Save a copy by re-using the 5993 // destination register on the final insert. 5994 assert(PrevMI && "PrevMI was null?"); 5995 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5996 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5997 } 5998 5999 I.eraseFromParent(); 6000 return true; 6001 } 6002 6003 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 6004 unsigned NumVecs, 6005 MachineInstr &I) { 6006 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 6007 assert(Opc && "Expected an opcode?"); 6008 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 6009 auto &MRI = *MIB.getMRI(); 6010 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6011 unsigned Size = Ty.getSizeInBits(); 6012 assert((Size == 64 || Size == 128) && 6013 "Destination must be 64 bits or 128 bits?"); 6014 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 6015 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 6016 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 6017 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 6018 Load.cloneMemRefs(I); 6019 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 6020 Register SelectedLoadDst = Load->getOperand(0).getReg(); 6021 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 6022 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 6023 .addReg(SelectedLoadDst, 0, SubReg + Idx); 6024 // Emit the subreg copies and immediately select them. 6025 // FIXME: We should refactor our copy code into an emitCopy helper and 6026 // clean up uses of this pattern elsewhere in the selector. 
6027 selectCopy(*Vec, TII, MRI, TRI, RBI); 6028 } 6029 return true; 6030 } 6031 6032 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic( 6033 unsigned Opc, unsigned NumVecs, MachineInstr &I) { 6034 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 6035 assert(Opc && "Expected an opcode?"); 6036 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 6037 auto &MRI = *MIB.getMRI(); 6038 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6039 bool Narrow = Ty.getSizeInBits() == 64; 6040 6041 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1; 6042 SmallVector<Register, 4> Regs(NumVecs); 6043 std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(), 6044 [](auto MO) { return MO.getReg(); }); 6045 6046 if (Narrow) { 6047 transform(Regs, Regs.begin(), [this](Register Reg) { 6048 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 6049 ->getOperand(0) 6050 .getReg(); 6051 }); 6052 Ty = Ty.multiplyElements(2); 6053 } 6054 6055 Register Tuple = createQTuple(Regs, MIB); 6056 auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI); 6057 if (!LaneNo) 6058 return false; 6059 6060 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg(); 6061 auto Load = MIB.buildInstr(Opc, {Ty}, {}) 6062 .addReg(Tuple) 6063 .addImm(LaneNo->getZExtValue()) 6064 .addReg(Ptr); 6065 Load.cloneMemRefs(I); 6066 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 6067 Register SelectedLoadDst = Load->getOperand(0).getReg(); 6068 unsigned SubReg = AArch64::qsub0; 6069 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 6070 auto Vec = MIB.buildInstr(TargetOpcode::COPY, 6071 {Narrow ? DstOp(&AArch64::FPR128RegClass) 6072 : DstOp(I.getOperand(Idx).getReg())}, 6073 {}) 6074 .addReg(SelectedLoadDst, 0, SubReg + Idx); 6075 Register WideReg = Vec.getReg(0); 6076 // Emit the subreg copies and immediately select them. 6077 selectCopy(*Vec, TII, MRI, TRI, RBI); 6078 if (Narrow && 6079 !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI)) 6080 return false; 6081 } 6082 return true; 6083 } 6084 6085 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I, 6086 unsigned NumVecs, 6087 unsigned Opc) { 6088 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 6089 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6090 Register Ptr = I.getOperand(1 + NumVecs).getReg(); 6091 6092 SmallVector<Register, 2> Regs(NumVecs); 6093 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 6094 Regs.begin(), [](auto MO) { return MO.getReg(); }); 6095 6096 Register Tuple = Ty.getSizeInBits() == 128 ? 
createQTuple(Regs, MIB) 6097 : createDTuple(Regs, MIB); 6098 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); 6099 Store.cloneMemRefs(I); 6100 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 6101 } 6102 6103 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic( 6104 MachineInstr &I, unsigned NumVecs, unsigned Opc) { 6105 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 6106 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6107 bool Narrow = Ty.getSizeInBits() == 64; 6108 6109 SmallVector<Register, 2> Regs(NumVecs); 6110 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 6111 Regs.begin(), [](auto MO) { return MO.getReg(); }); 6112 6113 if (Narrow) 6114 transform(Regs, Regs.begin(), [this](Register Reg) { 6115 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 6116 ->getOperand(0) 6117 .getReg(); 6118 }); 6119 6120 Register Tuple = createQTuple(Regs, MIB); 6121 6122 auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI); 6123 if (!LaneNo) 6124 return false; 6125 Register Ptr = I.getOperand(1 + NumVecs + 1).getReg(); 6126 auto Store = MIB.buildInstr(Opc, {}, {}) 6127 .addReg(Tuple) 6128 .addImm(LaneNo->getZExtValue()) 6129 .addReg(Ptr); 6130 Store.cloneMemRefs(I); 6131 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 6132 return true; 6133 } 6134 6135 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 6136 MachineInstr &I, MachineRegisterInfo &MRI) { 6137 // Find the intrinsic ID. 6138 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); 6139 6140 const LLT S8 = LLT::scalar(8); 6141 const LLT S16 = LLT::scalar(16); 6142 const LLT S32 = LLT::scalar(32); 6143 const LLT S64 = LLT::scalar(64); 6144 const LLT P0 = LLT::pointer(0, 64); 6145 // Select the instruction. 6146 switch (IntrinID) { 6147 default: 6148 return false; 6149 case Intrinsic::aarch64_ldxp: 6150 case Intrinsic::aarch64_ldaxp: { 6151 auto NewI = MIB.buildInstr( 6152 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 6153 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 6154 {I.getOperand(3)}); 6155 NewI.cloneMemRefs(I); 6156 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 6157 break; 6158 } 6159 case Intrinsic::trap: 6160 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); 6161 break; 6162 case Intrinsic::debugtrap: 6163 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); 6164 break; 6165 case Intrinsic::ubsantrap: 6166 MIB.buildInstr(AArch64::BRK, {}, {}) 6167 .addImm(I.getOperand(1).getImm() | ('U' << 8)); 6168 break; 6169 case Intrinsic::aarch64_neon_ld1x2: { 6170 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6171 unsigned Opc = 0; 6172 if (Ty == LLT::fixed_vector(8, S8)) 6173 Opc = AArch64::LD1Twov8b; 6174 else if (Ty == LLT::fixed_vector(16, S8)) 6175 Opc = AArch64::LD1Twov16b; 6176 else if (Ty == LLT::fixed_vector(4, S16)) 6177 Opc = AArch64::LD1Twov4h; 6178 else if (Ty == LLT::fixed_vector(8, S16)) 6179 Opc = AArch64::LD1Twov8h; 6180 else if (Ty == LLT::fixed_vector(2, S32)) 6181 Opc = AArch64::LD1Twov2s; 6182 else if (Ty == LLT::fixed_vector(4, S32)) 6183 Opc = AArch64::LD1Twov4s; 6184 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6185 Opc = AArch64::LD1Twov2d; 6186 else if (Ty == S64 || Ty == P0) 6187 Opc = AArch64::LD1Twov1d; 6188 else 6189 llvm_unreachable("Unexpected type for ld1x2!"); 6190 selectVectorLoadIntrinsic(Opc, 2, I); 6191 break; 6192 } 6193 case Intrinsic::aarch64_neon_ld1x3: { 6194 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6195 unsigned Opc = 0; 6196 if (Ty == LLT::fixed_vector(8, S8)) 6197 Opc = AArch64::LD1Threev8b; 6198 else if (Ty == LLT::fixed_vector(16, S8)) 6199 Opc = AArch64::LD1Threev16b; 6200 else if (Ty == LLT::fixed_vector(4, S16)) 6201 Opc = AArch64::LD1Threev4h; 6202 else if (Ty == LLT::fixed_vector(8, S16)) 6203 Opc = AArch64::LD1Threev8h; 6204 else if (Ty == LLT::fixed_vector(2, S32)) 6205 Opc = AArch64::LD1Threev2s; 6206 else if (Ty == LLT::fixed_vector(4, S32)) 6207 Opc = AArch64::LD1Threev4s; 6208 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6209 Opc = AArch64::LD1Threev2d; 6210 else if (Ty == S64 || Ty == P0) 6211 Opc = AArch64::LD1Threev1d; 6212 else 6213 llvm_unreachable("Unexpected type for ld1x3!"); 6214 selectVectorLoadIntrinsic(Opc, 3, I); 6215 break; 6216 } 6217 case Intrinsic::aarch64_neon_ld1x4: { 6218 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6219 unsigned Opc = 0; 6220 if (Ty == LLT::fixed_vector(8, S8)) 6221 Opc = AArch64::LD1Fourv8b; 6222 else if (Ty == LLT::fixed_vector(16, S8)) 6223 Opc = AArch64::LD1Fourv16b; 6224 else if (Ty == LLT::fixed_vector(4, S16)) 6225 Opc = AArch64::LD1Fourv4h; 6226 else if (Ty == LLT::fixed_vector(8, S16)) 6227 Opc = AArch64::LD1Fourv8h; 6228 else if (Ty == LLT::fixed_vector(2, S32)) 6229 Opc = AArch64::LD1Fourv2s; 6230 else if (Ty == LLT::fixed_vector(4, S32)) 6231 Opc = AArch64::LD1Fourv4s; 6232 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6233 Opc = AArch64::LD1Fourv2d; 6234 else if (Ty == S64 || Ty == P0) 6235 Opc = AArch64::LD1Fourv1d; 6236 else 6237 llvm_unreachable("Unexpected type for ld1x4!"); 6238 selectVectorLoadIntrinsic(Opc, 4, I); 6239 break; 6240 } 6241 case Intrinsic::aarch64_neon_ld2: { 6242 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6243 unsigned Opc = 0; 6244 if (Ty == LLT::fixed_vector(8, S8)) 6245 Opc = AArch64::LD2Twov8b; 6246 else if (Ty == LLT::fixed_vector(16, S8)) 6247 Opc = AArch64::LD2Twov16b; 6248 else if (Ty == LLT::fixed_vector(4, 
S16)) 6249 Opc = AArch64::LD2Twov4h; 6250 else if (Ty == LLT::fixed_vector(8, S16)) 6251 Opc = AArch64::LD2Twov8h; 6252 else if (Ty == LLT::fixed_vector(2, S32)) 6253 Opc = AArch64::LD2Twov2s; 6254 else if (Ty == LLT::fixed_vector(4, S32)) 6255 Opc = AArch64::LD2Twov4s; 6256 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6257 Opc = AArch64::LD2Twov2d; 6258 else if (Ty == S64 || Ty == P0) 6259 Opc = AArch64::LD1Twov1d; 6260 else 6261 llvm_unreachable("Unexpected type for ld2!"); 6262 selectVectorLoadIntrinsic(Opc, 2, I); 6263 break; 6264 } 6265 case Intrinsic::aarch64_neon_ld2lane: { 6266 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6267 unsigned Opc; 6268 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6269 Opc = AArch64::LD2i8; 6270 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6271 Opc = AArch64::LD2i16; 6272 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6273 Opc = AArch64::LD2i32; 6274 else if (Ty == LLT::fixed_vector(2, S64) || 6275 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6276 Opc = AArch64::LD2i64; 6277 else 6278 llvm_unreachable("Unexpected type for st2lane!"); 6279 if (!selectVectorLoadLaneIntrinsic(Opc, 2, I)) 6280 return false; 6281 break; 6282 } 6283 case Intrinsic::aarch64_neon_ld2r: { 6284 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6285 unsigned Opc = 0; 6286 if (Ty == LLT::fixed_vector(8, S8)) 6287 Opc = AArch64::LD2Rv8b; 6288 else if (Ty == LLT::fixed_vector(16, S8)) 6289 Opc = AArch64::LD2Rv16b; 6290 else if (Ty == LLT::fixed_vector(4, S16)) 6291 Opc = AArch64::LD2Rv4h; 6292 else if (Ty == LLT::fixed_vector(8, S16)) 6293 Opc = AArch64::LD2Rv8h; 6294 else if (Ty == LLT::fixed_vector(2, S32)) 6295 Opc = AArch64::LD2Rv2s; 6296 else if (Ty == LLT::fixed_vector(4, S32)) 6297 Opc = AArch64::LD2Rv4s; 6298 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6299 Opc = AArch64::LD2Rv2d; 6300 else if (Ty == S64 || Ty == P0) 6301 Opc = AArch64::LD2Rv1d; 6302 else 6303 llvm_unreachable("Unexpected type for ld2r!"); 6304 selectVectorLoadIntrinsic(Opc, 2, I); 6305 break; 6306 } 6307 case Intrinsic::aarch64_neon_ld3: { 6308 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6309 unsigned Opc = 0; 6310 if (Ty == LLT::fixed_vector(8, S8)) 6311 Opc = AArch64::LD3Threev8b; 6312 else if (Ty == LLT::fixed_vector(16, S8)) 6313 Opc = AArch64::LD3Threev16b; 6314 else if (Ty == LLT::fixed_vector(4, S16)) 6315 Opc = AArch64::LD3Threev4h; 6316 else if (Ty == LLT::fixed_vector(8, S16)) 6317 Opc = AArch64::LD3Threev8h; 6318 else if (Ty == LLT::fixed_vector(2, S32)) 6319 Opc = AArch64::LD3Threev2s; 6320 else if (Ty == LLT::fixed_vector(4, S32)) 6321 Opc = AArch64::LD3Threev4s; 6322 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6323 Opc = AArch64::LD3Threev2d; 6324 else if (Ty == S64 || Ty == P0) 6325 Opc = AArch64::LD1Threev1d; 6326 else 6327 llvm_unreachable("Unexpected type for ld3!"); 6328 selectVectorLoadIntrinsic(Opc, 3, I); 6329 break; 6330 } 6331 case Intrinsic::aarch64_neon_ld3lane: { 6332 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6333 unsigned Opc; 6334 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6335 Opc = AArch64::LD3i8; 6336 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6337 Opc = AArch64::LD3i16; 6338 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6339 Opc = AArch64::LD3i32; 6340 else if (Ty == 
LLT::fixed_vector(2, S64) || 6341 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6342 Opc = AArch64::LD3i64; 6343 else 6344 llvm_unreachable("Unexpected type for st3lane!"); 6345 if (!selectVectorLoadLaneIntrinsic(Opc, 3, I)) 6346 return false; 6347 break; 6348 } 6349 case Intrinsic::aarch64_neon_ld3r: { 6350 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6351 unsigned Opc = 0; 6352 if (Ty == LLT::fixed_vector(8, S8)) 6353 Opc = AArch64::LD3Rv8b; 6354 else if (Ty == LLT::fixed_vector(16, S8)) 6355 Opc = AArch64::LD3Rv16b; 6356 else if (Ty == LLT::fixed_vector(4, S16)) 6357 Opc = AArch64::LD3Rv4h; 6358 else if (Ty == LLT::fixed_vector(8, S16)) 6359 Opc = AArch64::LD3Rv8h; 6360 else if (Ty == LLT::fixed_vector(2, S32)) 6361 Opc = AArch64::LD3Rv2s; 6362 else if (Ty == LLT::fixed_vector(4, S32)) 6363 Opc = AArch64::LD3Rv4s; 6364 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6365 Opc = AArch64::LD3Rv2d; 6366 else if (Ty == S64 || Ty == P0) 6367 Opc = AArch64::LD3Rv1d; 6368 else 6369 llvm_unreachable("Unexpected type for ld3r!"); 6370 selectVectorLoadIntrinsic(Opc, 3, I); 6371 break; 6372 } 6373 case Intrinsic::aarch64_neon_ld4: { 6374 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6375 unsigned Opc = 0; 6376 if (Ty == LLT::fixed_vector(8, S8)) 6377 Opc = AArch64::LD4Fourv8b; 6378 else if (Ty == LLT::fixed_vector(16, S8)) 6379 Opc = AArch64::LD4Fourv16b; 6380 else if (Ty == LLT::fixed_vector(4, S16)) 6381 Opc = AArch64::LD4Fourv4h; 6382 else if (Ty == LLT::fixed_vector(8, S16)) 6383 Opc = AArch64::LD4Fourv8h; 6384 else if (Ty == LLT::fixed_vector(2, S32)) 6385 Opc = AArch64::LD4Fourv2s; 6386 else if (Ty == LLT::fixed_vector(4, S32)) 6387 Opc = AArch64::LD4Fourv4s; 6388 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6389 Opc = AArch64::LD4Fourv2d; 6390 else if (Ty == S64 || Ty == P0) 6391 Opc = AArch64::LD1Fourv1d; 6392 else 6393 llvm_unreachable("Unexpected type for ld4!"); 6394 selectVectorLoadIntrinsic(Opc, 4, I); 6395 break; 6396 } 6397 case Intrinsic::aarch64_neon_ld4lane: { 6398 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6399 unsigned Opc; 6400 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6401 Opc = AArch64::LD4i8; 6402 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6403 Opc = AArch64::LD4i16; 6404 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6405 Opc = AArch64::LD4i32; 6406 else if (Ty == LLT::fixed_vector(2, S64) || 6407 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6408 Opc = AArch64::LD4i64; 6409 else 6410 llvm_unreachable("Unexpected type for st4lane!"); 6411 if (!selectVectorLoadLaneIntrinsic(Opc, 4, I)) 6412 return false; 6413 break; 6414 } 6415 case Intrinsic::aarch64_neon_ld4r: { 6416 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6417 unsigned Opc = 0; 6418 if (Ty == LLT::fixed_vector(8, S8)) 6419 Opc = AArch64::LD4Rv8b; 6420 else if (Ty == LLT::fixed_vector(16, S8)) 6421 Opc = AArch64::LD4Rv16b; 6422 else if (Ty == LLT::fixed_vector(4, S16)) 6423 Opc = AArch64::LD4Rv4h; 6424 else if (Ty == LLT::fixed_vector(8, S16)) 6425 Opc = AArch64::LD4Rv8h; 6426 else if (Ty == LLT::fixed_vector(2, S32)) 6427 Opc = AArch64::LD4Rv2s; 6428 else if (Ty == LLT::fixed_vector(4, S32)) 6429 Opc = AArch64::LD4Rv4s; 6430 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6431 Opc = AArch64::LD4Rv2d; 6432 else if (Ty == S64 || Ty == P0) 6433 Opc = AArch64::LD4Rv1d; 6434 else 6435 
llvm_unreachable("Unexpected type for ld4r!"); 6436 selectVectorLoadIntrinsic(Opc, 4, I); 6437 break; 6438 } 6439 case Intrinsic::aarch64_neon_st1x2: { 6440 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6441 unsigned Opc; 6442 if (Ty == LLT::fixed_vector(8, S8)) 6443 Opc = AArch64::ST1Twov8b; 6444 else if (Ty == LLT::fixed_vector(16, S8)) 6445 Opc = AArch64::ST1Twov16b; 6446 else if (Ty == LLT::fixed_vector(4, S16)) 6447 Opc = AArch64::ST1Twov4h; 6448 else if (Ty == LLT::fixed_vector(8, S16)) 6449 Opc = AArch64::ST1Twov8h; 6450 else if (Ty == LLT::fixed_vector(2, S32)) 6451 Opc = AArch64::ST1Twov2s; 6452 else if (Ty == LLT::fixed_vector(4, S32)) 6453 Opc = AArch64::ST1Twov4s; 6454 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6455 Opc = AArch64::ST1Twov2d; 6456 else if (Ty == S64 || Ty == P0) 6457 Opc = AArch64::ST1Twov1d; 6458 else 6459 llvm_unreachable("Unexpected type for st1x2!"); 6460 selectVectorStoreIntrinsic(I, 2, Opc); 6461 break; 6462 } 6463 case Intrinsic::aarch64_neon_st1x3: { 6464 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6465 unsigned Opc; 6466 if (Ty == LLT::fixed_vector(8, S8)) 6467 Opc = AArch64::ST1Threev8b; 6468 else if (Ty == LLT::fixed_vector(16, S8)) 6469 Opc = AArch64::ST1Threev16b; 6470 else if (Ty == LLT::fixed_vector(4, S16)) 6471 Opc = AArch64::ST1Threev4h; 6472 else if (Ty == LLT::fixed_vector(8, S16)) 6473 Opc = AArch64::ST1Threev8h; 6474 else if (Ty == LLT::fixed_vector(2, S32)) 6475 Opc = AArch64::ST1Threev2s; 6476 else if (Ty == LLT::fixed_vector(4, S32)) 6477 Opc = AArch64::ST1Threev4s; 6478 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6479 Opc = AArch64::ST1Threev2d; 6480 else if (Ty == S64 || Ty == P0) 6481 Opc = AArch64::ST1Threev1d; 6482 else 6483 llvm_unreachable("Unexpected type for st1x3!"); 6484 selectVectorStoreIntrinsic(I, 3, Opc); 6485 break; 6486 } 6487 case Intrinsic::aarch64_neon_st1x4: { 6488 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6489 unsigned Opc; 6490 if (Ty == LLT::fixed_vector(8, S8)) 6491 Opc = AArch64::ST1Fourv8b; 6492 else if (Ty == LLT::fixed_vector(16, S8)) 6493 Opc = AArch64::ST1Fourv16b; 6494 else if (Ty == LLT::fixed_vector(4, S16)) 6495 Opc = AArch64::ST1Fourv4h; 6496 else if (Ty == LLT::fixed_vector(8, S16)) 6497 Opc = AArch64::ST1Fourv8h; 6498 else if (Ty == LLT::fixed_vector(2, S32)) 6499 Opc = AArch64::ST1Fourv2s; 6500 else if (Ty == LLT::fixed_vector(4, S32)) 6501 Opc = AArch64::ST1Fourv4s; 6502 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6503 Opc = AArch64::ST1Fourv2d; 6504 else if (Ty == S64 || Ty == P0) 6505 Opc = AArch64::ST1Fourv1d; 6506 else 6507 llvm_unreachable("Unexpected type for st1x4!"); 6508 selectVectorStoreIntrinsic(I, 4, Opc); 6509 break; 6510 } 6511 case Intrinsic::aarch64_neon_st2: { 6512 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6513 unsigned Opc; 6514 if (Ty == LLT::fixed_vector(8, S8)) 6515 Opc = AArch64::ST2Twov8b; 6516 else if (Ty == LLT::fixed_vector(16, S8)) 6517 Opc = AArch64::ST2Twov16b; 6518 else if (Ty == LLT::fixed_vector(4, S16)) 6519 Opc = AArch64::ST2Twov4h; 6520 else if (Ty == LLT::fixed_vector(8, S16)) 6521 Opc = AArch64::ST2Twov8h; 6522 else if (Ty == LLT::fixed_vector(2, S32)) 6523 Opc = AArch64::ST2Twov2s; 6524 else if (Ty == LLT::fixed_vector(4, S32)) 6525 Opc = AArch64::ST2Twov4s; 6526 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6527 Opc = AArch64::ST2Twov2d; 6528 else if (Ty == S64 || Ty == P0) 6529 Opc = AArch64::ST1Twov1d; 6530 else 
6531 llvm_unreachable("Unexpected type for st2!"); 6532 selectVectorStoreIntrinsic(I, 2, Opc); 6533 break; 6534 } 6535 case Intrinsic::aarch64_neon_st3: { 6536 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6537 unsigned Opc; 6538 if (Ty == LLT::fixed_vector(8, S8)) 6539 Opc = AArch64::ST3Threev8b; 6540 else if (Ty == LLT::fixed_vector(16, S8)) 6541 Opc = AArch64::ST3Threev16b; 6542 else if (Ty == LLT::fixed_vector(4, S16)) 6543 Opc = AArch64::ST3Threev4h; 6544 else if (Ty == LLT::fixed_vector(8, S16)) 6545 Opc = AArch64::ST3Threev8h; 6546 else if (Ty == LLT::fixed_vector(2, S32)) 6547 Opc = AArch64::ST3Threev2s; 6548 else if (Ty == LLT::fixed_vector(4, S32)) 6549 Opc = AArch64::ST3Threev4s; 6550 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6551 Opc = AArch64::ST3Threev2d; 6552 else if (Ty == S64 || Ty == P0) 6553 Opc = AArch64::ST1Threev1d; 6554 else 6555 llvm_unreachable("Unexpected type for st3!"); 6556 selectVectorStoreIntrinsic(I, 3, Opc); 6557 break; 6558 } 6559 case Intrinsic::aarch64_neon_st4: { 6560 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6561 unsigned Opc; 6562 if (Ty == LLT::fixed_vector(8, S8)) 6563 Opc = AArch64::ST4Fourv8b; 6564 else if (Ty == LLT::fixed_vector(16, S8)) 6565 Opc = AArch64::ST4Fourv16b; 6566 else if (Ty == LLT::fixed_vector(4, S16)) 6567 Opc = AArch64::ST4Fourv4h; 6568 else if (Ty == LLT::fixed_vector(8, S16)) 6569 Opc = AArch64::ST4Fourv8h; 6570 else if (Ty == LLT::fixed_vector(2, S32)) 6571 Opc = AArch64::ST4Fourv2s; 6572 else if (Ty == LLT::fixed_vector(4, S32)) 6573 Opc = AArch64::ST4Fourv4s; 6574 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6575 Opc = AArch64::ST4Fourv2d; 6576 else if (Ty == S64 || Ty == P0) 6577 Opc = AArch64::ST1Fourv1d; 6578 else 6579 llvm_unreachable("Unexpected type for st4!"); 6580 selectVectorStoreIntrinsic(I, 4, Opc); 6581 break; 6582 } 6583 case Intrinsic::aarch64_neon_st2lane: { 6584 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6585 unsigned Opc; 6586 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6587 Opc = AArch64::ST2i8; 6588 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6589 Opc = AArch64::ST2i16; 6590 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6591 Opc = AArch64::ST2i32; 6592 else if (Ty == LLT::fixed_vector(2, S64) || 6593 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6594 Opc = AArch64::ST2i64; 6595 else 6596 llvm_unreachable("Unexpected type for st2lane!"); 6597 if (!selectVectorStoreLaneIntrinsic(I, 2, Opc)) 6598 return false; 6599 break; 6600 } 6601 case Intrinsic::aarch64_neon_st3lane: { 6602 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6603 unsigned Opc; 6604 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6605 Opc = AArch64::ST3i8; 6606 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6607 Opc = AArch64::ST3i16; 6608 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6609 Opc = AArch64::ST3i32; 6610 else if (Ty == LLT::fixed_vector(2, S64) || 6611 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6612 Opc = AArch64::ST3i64; 6613 else 6614 llvm_unreachable("Unexpected type for st3lane!"); 6615 if (!selectVectorStoreLaneIntrinsic(I, 3, Opc)) 6616 return false; 6617 break; 6618 } 6619 case Intrinsic::aarch64_neon_st4lane: { 6620 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6621 unsigned Opc; 6622 if (Ty == LLT::fixed_vector(8, S8) || Ty == 
LLT::fixed_vector(16, S8)) 6623 Opc = AArch64::ST4i8; 6624 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6625 Opc = AArch64::ST4i16; 6626 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6627 Opc = AArch64::ST4i32; 6628 else if (Ty == LLT::fixed_vector(2, S64) || 6629 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6630 Opc = AArch64::ST4i64; 6631 else 6632 llvm_unreachable("Unexpected type for st4lane!"); 6633 if (!selectVectorStoreLaneIntrinsic(I, 4, Opc)) 6634 return false; 6635 break; 6636 } 6637 case Intrinsic::aarch64_mops_memset_tag: { 6638 // Transform 6639 // %dst:gpr(p0) = \ 6640 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag), 6641 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64) 6642 // where %dst is updated, into 6643 // (%Rd:GPR64common, %Rn:GPR64) = \ 6644 // MOPSMemorySetTaggingPseudo \ 6645 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64 6646 // where Rd and Rn are tied. 6647 // It is expected that %val has been extended to s64 in legalization. 6648 // Note that the order of the size/value operands is swapped. 6649 6650 Register DstDef = I.getOperand(0).getReg(); 6651 // I.getOperand(1) is the intrinsic function 6652 Register DstUse = I.getOperand(2).getReg(); 6653 Register ValUse = I.getOperand(3).getReg(); 6654 Register SizeUse = I.getOperand(4).getReg(); 6655 6656 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one. 6657 // Therefore an additional virtual register is required for the updated size 6658 // operand. This value is not accessible via the semantics of the intrinsic. 6659 Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64)); 6660 6661 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo, 6662 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse}); 6663 Memset.cloneMemRefs(I); 6664 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI); 6665 break; 6666 } 6667 } 6668 6669 I.eraseFromParent(); 6670 return true; 6671 } 6672 6673 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, 6674 MachineRegisterInfo &MRI) { 6675 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); 6676 6677 switch (IntrinID) { 6678 default: 6679 break; 6680 case Intrinsic::aarch64_crypto_sha1h: { 6681 Register DstReg = I.getOperand(0).getReg(); 6682 Register SrcReg = I.getOperand(2).getReg(); 6683 6684 // FIXME: Should this be an assert? 6685 if (MRI.getType(DstReg).getSizeInBits() != 32 || 6686 MRI.getType(SrcReg).getSizeInBits() != 32) 6687 return false; 6688 6689 // The operation has to happen on FPRs. Set up some new FPR registers for 6690 // the source and destination if they are on GPRs. 6691 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 6692 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 6693 MIB.buildCopy({SrcReg}, {I.getOperand(2)}); 6694 6695 // Make sure the copy ends up getting constrained properly. 6696 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 6697 AArch64::GPR32RegClass, MRI); 6698 } 6699 6700 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) 6701 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 6702 6703 // Actually insert the instruction. 6704 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); 6705 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); 6706 6707 // Did we create a new register for the destination? 6708 if (DstReg != I.getOperand(0).getReg()) { 6709 // Yep. 
Copy the result of the instruction back into the original 6710 // destination. 6711 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 6712 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 6713 AArch64::GPR32RegClass, MRI); 6714 } 6715 6716 I.eraseFromParent(); 6717 return true; 6718 } 6719 case Intrinsic::frameaddress: 6720 case Intrinsic::returnaddress: { 6721 MachineFunction &MF = *I.getParent()->getParent(); 6722 MachineFrameInfo &MFI = MF.getFrameInfo(); 6723 6724 unsigned Depth = I.getOperand(2).getImm(); 6725 Register DstReg = I.getOperand(0).getReg(); 6726 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6727 6728 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 6729 if (!MFReturnAddr) { 6730 // Insert the copy from LR/X30 into the entry block, before it can be 6731 // clobbered by anything. 6732 MFI.setReturnAddressIsTaken(true); 6733 MFReturnAddr = getFunctionLiveInPhysReg( 6734 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); 6735 } 6736 6737 if (STI.hasPAuth()) { 6738 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 6739 } else { 6740 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 6741 MIB.buildInstr(AArch64::XPACLRI); 6742 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6743 } 6744 6745 I.eraseFromParent(); 6746 return true; 6747 } 6748 6749 MFI.setFrameAddressIsTaken(true); 6750 Register FrameAddr(AArch64::FP); 6751 while (Depth--) { 6752 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 6753 auto Ldr = 6754 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 6755 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 6756 FrameAddr = NextFrame; 6757 } 6758 6759 if (IntrinID == Intrinsic::frameaddress) 6760 MIB.buildCopy({DstReg}, {FrameAddr}); 6761 else { 6762 MFI.setReturnAddressIsTaken(true); 6763 6764 if (STI.hasPAuth()) { 6765 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 6766 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 6767 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 6768 } else { 6769 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 6770 .addImm(1); 6771 MIB.buildInstr(AArch64::XPACLRI); 6772 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6773 } 6774 } 6775 6776 I.eraseFromParent(); 6777 return true; 6778 } 6779 case Intrinsic::swift_async_context_addr: 6780 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 6781 {Register(AArch64::FP)}) 6782 .addImm(8) 6783 .addImm(0); 6784 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 6785 6786 MF->getFrameInfo().setFrameAddressIsTaken(true); 6787 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 6788 I.eraseFromParent(); 6789 return true; 6790 } 6791 return false; 6792 } 6793 6794 InstructionSelector::ComplexRendererFns 6795 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 6796 auto MaybeImmed = getImmedFromMO(Root); 6797 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 6798 return std::nullopt; 6799 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 6800 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6801 } 6802 6803 InstructionSelector::ComplexRendererFns 6804 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 6805 auto MaybeImmed = getImmedFromMO(Root); 6806 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 6807 return std::nullopt; 6808 uint64_t Enc = 31 - *MaybeImmed; 6809 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 
6810 } 6811 6812 InstructionSelector::ComplexRendererFns 6813 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 6814 auto MaybeImmed = getImmedFromMO(Root); 6815 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 6816 return std::nullopt; 6817 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 6818 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6819 } 6820 6821 InstructionSelector::ComplexRendererFns 6822 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 6823 auto MaybeImmed = getImmedFromMO(Root); 6824 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 6825 return std::nullopt; 6826 uint64_t Enc = 63 - *MaybeImmed; 6827 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 6828 } 6829 6830 /// Helper to select an immediate value that can be represented as a 12-bit 6831 /// value shifted left by either 0 or 12. If it is possible to do so, return 6832 /// the immediate and shift value. If not, return std::nullopt. 6833 /// 6834 /// Used by selectArithImmed and selectNegArithImmed. 6835 InstructionSelector::ComplexRendererFns 6836 AArch64InstructionSelector::select12BitValueWithLeftShift( 6837 uint64_t Immed) const { 6838 unsigned ShiftAmt; 6839 if (Immed >> 12 == 0) { 6840 ShiftAmt = 0; 6841 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 6842 ShiftAmt = 12; 6843 Immed = Immed >> 12; 6844 } else 6845 return std::nullopt; 6846 6847 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 6848 return {{ 6849 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 6850 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 6851 }}; 6852 } 6853 6854 /// SelectArithImmed - Select an immediate value that can be represented as 6855 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 6856 /// Val set to the 12-bit value and Shift set to the shifter operand. 6857 InstructionSelector::ComplexRendererFns 6858 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 6859 // This function is called from the addsub_shifted_imm ComplexPattern, 6860 // which lists [imm] as the list of opcode it's interested in, however 6861 // we still need to check whether the operand is actually an immediate 6862 // here because the ComplexPattern opcode list is only used in 6863 // root-level opcode matching. 6864 auto MaybeImmed = getImmedFromMO(Root); 6865 if (MaybeImmed == std::nullopt) 6866 return std::nullopt; 6867 return select12BitValueWithLeftShift(*MaybeImmed); 6868 } 6869 6870 /// SelectNegArithImmed - As above, but negates the value before trying to 6871 /// select it. 6872 InstructionSelector::ComplexRendererFns 6873 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 6874 // We need a register here, because we need to know if we have a 64 or 32 6875 // bit immediate. 6876 if (!Root.isReg()) 6877 return std::nullopt; 6878 auto MaybeImmed = getImmedFromMO(Root); 6879 if (MaybeImmed == std::nullopt) 6880 return std::nullopt; 6881 uint64_t Immed = *MaybeImmed; 6882 6883 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 6884 // have the opposite effect on the C flag, so this pattern mustn't match under 6885 // those circumstances. 6886 if (Immed == 0) 6887 return std::nullopt; 6888 6889 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 6890 // the root. 
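  // Illustration: for a 32-bit compare against the constant -5, the negation
  // below yields 5, which fits the 12-bit arithmetic immediate form, so the
  // compare can typically be selected with the negated immediate (e.g. as a
  // CMN) rather than materializing -5 in a register.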
6891 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6892 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 6893 Immed = ~((uint32_t)Immed) + 1; 6894 else 6895 Immed = ~Immed + 1ULL; 6896 6897 if (Immed & 0xFFFFFFFFFF000000ULL) 6898 return std::nullopt; 6899 6900 Immed &= 0xFFFFFFULL; 6901 return select12BitValueWithLeftShift(Immed); 6902 } 6903 6904 /// Return true if it is worth folding MI into an extended register. That is, 6905 /// if it's safe to pull it into the addressing mode of a load or store as a 6906 /// shift. 6907 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 6908 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 6909 // Always fold if there is one use, or if we're optimizing for size. 6910 Register DefReg = MI.getOperand(0).getReg(); 6911 if (MRI.hasOneNonDBGUse(DefReg) || 6912 MI.getParent()->getParent()->getFunction().hasOptSize()) 6913 return true; 6914 6915 // It's better to avoid folding and recomputing shifts when we don't have a 6916 // fastpath. 6917 if (!STI.hasAddrLSLFast()) 6918 return false; 6919 6920 // We have a fastpath, so folding a shift in and potentially computing it 6921 // many times may be beneficial. Check if this is only used in memory ops. 6922 // If it is, then we should fold. 6923 return all_of(MRI.use_nodbg_instructions(DefReg), 6924 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 6925 } 6926 6927 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 6928 switch (Type) { 6929 case AArch64_AM::SXTB: 6930 case AArch64_AM::SXTH: 6931 case AArch64_AM::SXTW: 6932 return true; 6933 default: 6934 return false; 6935 } 6936 } 6937 6938 InstructionSelector::ComplexRendererFns 6939 AArch64InstructionSelector::selectExtendedSHL( 6940 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 6941 unsigned SizeInBytes, bool WantsExt) const { 6942 assert(Base.isReg() && "Expected base to be a register operand"); 6943 assert(Offset.isReg() && "Expected offset to be a register operand"); 6944 6945 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 6946 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 6947 6948 unsigned OffsetOpc = OffsetInst->getOpcode(); 6949 bool LookedThroughZExt = false; 6950 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 6951 // Try to look through a ZEXT. 6952 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 6953 return std::nullopt; 6954 6955 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 6956 OffsetOpc = OffsetInst->getOpcode(); 6957 LookedThroughZExt = true; 6958 6959 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 6960 return std::nullopt; 6961 } 6962 // Make sure that the memory op is a valid size. 6963 int64_t LegalShiftVal = Log2_32(SizeInBytes); 6964 if (LegalShiftVal == 0) 6965 return std::nullopt; 6966 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 6967 return std::nullopt; 6968 6969 // Now, try to find the specific G_CONSTANT. Start by assuming that the 6970 // register we will offset is the LHS, and the register containing the 6971 // constant is the RHS. 6972 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 6973 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 6974 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6975 if (!ValAndVReg) { 6976 // We didn't get a constant on the RHS. If the opcode is a shift, then 6977 // we're done. 
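  // (G_MUL is commutative, so if the constant was not on the RHS we can retry
  // below with the operands swapped; a G_SHL offers no such second chance.)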
6978 if (OffsetOpc == TargetOpcode::G_SHL) 6979 return std::nullopt; 6980 6981 // If we have a G_MUL, we can use either register. Try looking at the RHS. 6982 std::swap(OffsetReg, ConstantReg); 6983 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 6984 if (!ValAndVReg) 6985 return std::nullopt; 6986 } 6987 6988 // The value must fit into 3 bits, and must be positive. Make sure that is 6989 // true. 6990 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 6991 6992 // Since we're going to pull this into a shift, the constant value must be 6993 // a power of 2. If we got a multiply, then we need to check this. 6994 if (OffsetOpc == TargetOpcode::G_MUL) { 6995 if (!llvm::has_single_bit<uint32_t>(ImmVal)) 6996 return std::nullopt; 6997 6998 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 6999 ImmVal = Log2_32(ImmVal); 7000 } 7001 7002 if ((ImmVal & 0x7) != ImmVal) 7003 return std::nullopt; 7004 7005 // We are only allowed to shift by LegalShiftVal. This shift value is built 7006 // into the instruction, so we can't just use whatever we want. 7007 if (ImmVal != LegalShiftVal) 7008 return std::nullopt; 7009 7010 unsigned SignExtend = 0; 7011 if (WantsExt) { 7012 // Check if the offset is defined by an extend, unless we looked through a 7013 // G_ZEXT earlier. 7014 if (!LookedThroughZExt) { 7015 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 7016 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 7017 if (Ext == AArch64_AM::InvalidShiftExtend) 7018 return std::nullopt; 7019 7020 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 7021 // We only support SXTW for signed extension here. 7022 if (SignExtend && Ext != AArch64_AM::SXTW) 7023 return std::nullopt; 7024 OffsetReg = ExtInst->getOperand(1).getReg(); 7025 } 7026 7027 // Need a 32-bit wide register here. 7028 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 7029 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 7030 } 7031 7032 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 7033 // offset. Signify that we are shifting by setting the shift flag to 1. 7034 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 7035 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 7036 [=](MachineInstrBuilder &MIB) { 7037 // Need to add both immediates here to make sure that they are both 7038 // added to the instruction. 7039 MIB.addImm(SignExtend); 7040 MIB.addImm(1); 7041 }}}; 7042 } 7043 7044 /// This is used for computing addresses like this: 7045 /// 7046 /// ldr x1, [x2, x3, lsl #3] 7047 /// 7048 /// Where x2 is the base register, and x3 is an offset register. The shift-left 7049 /// is a constant value specific to this load instruction. That is, we'll never 7050 /// see anything other than a 3 here (which corresponds to the size of the 7051 /// element being loaded.) 7052 InstructionSelector::ComplexRendererFns 7053 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( 7054 MachineOperand &Root, unsigned SizeInBytes) const { 7055 if (!Root.isReg()) 7056 return std::nullopt; 7057 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7058 7059 // We want to find something like this: 7060 // 7061 // val = G_CONSTANT LegalShiftVal 7062 // shift = G_SHL off_reg val 7063 // ptr = G_PTR_ADD base_reg shift 7064 // x = G_LOAD ptr 7065 // 7066 // And fold it into this addressing mode: 7067 // 7068 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] 7069 7070 // Check if we can find the G_PTR_ADD. 
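  // For example, with SizeInBytes == 8 the only offset shift accepted by
  // selectExtendedSHL below is lsl #3, because the scaled addressing mode
  // bakes the element size into the shift amount.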
7071 MachineInstr *PtrAdd = 7072 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7073 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 7074 return std::nullopt; 7075 7076 // Now, try to match an opcode which will match our specific offset. 7077 // We want a G_SHL or a G_MUL. 7078 MachineInstr *OffsetInst = 7079 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); 7080 return selectExtendedSHL(Root, PtrAdd->getOperand(1), 7081 OffsetInst->getOperand(0), SizeInBytes, 7082 /*WantsExt=*/false); 7083 } 7084 7085 /// This is used for computing addresses like this: 7086 /// 7087 /// ldr x1, [x2, x3] 7088 /// 7089 /// Where x2 is the base register, and x3 is an offset register. 7090 /// 7091 /// When possible (or profitable) to fold a G_PTR_ADD into the address 7092 /// calculation, this will do so. Otherwise, it will return std::nullopt. 7093 InstructionSelector::ComplexRendererFns 7094 AArch64InstructionSelector::selectAddrModeRegisterOffset( 7095 MachineOperand &Root) const { 7096 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7097 7098 // We need a GEP. 7099 MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); 7100 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD) 7101 return std::nullopt; 7102 7103 // If this is used more than once, let's not bother folding. 7104 // TODO: Check if they are memory ops. If they are, then we can still fold 7105 // without having to recompute anything. 7106 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) 7107 return std::nullopt; 7108 7109 // Base is the GEP's LHS, offset is its RHS. 7110 return {{[=](MachineInstrBuilder &MIB) { 7111 MIB.addUse(Gep->getOperand(1).getReg()); 7112 }, 7113 [=](MachineInstrBuilder &MIB) { 7114 MIB.addUse(Gep->getOperand(2).getReg()); 7115 }, 7116 [=](MachineInstrBuilder &MIB) { 7117 // Need to add both immediates here to make sure that they are both 7118 // added to the instruction. 7119 MIB.addImm(0); 7120 MIB.addImm(0); 7121 }}}; 7122 } 7123 7124 /// This is intended to be equivalent to selectAddrModeXRO in 7125 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. 7126 InstructionSelector::ComplexRendererFns 7127 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, 7128 unsigned SizeInBytes) const { 7129 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7130 if (!Root.isReg()) 7131 return std::nullopt; 7132 MachineInstr *PtrAdd = 7133 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7134 if (!PtrAdd) 7135 return std::nullopt; 7136 7137 // Check for an immediate which cannot be encoded in the [base + imm] 7138 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll 7139 // end up with code like: 7140 // 7141 // mov x0, wide 7142 // add x1, base, x0 7143 // ldr x2, [x1, x0] 7144 // 7145 // In this situation, we can use the [base, xreg] addressing mode to save an 7146 // add/sub: 7147 // 7148 // mov x0, wide 7149 // ldr x2, [base, x0] 7150 auto ValAndVReg = 7151 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); 7152 if (ValAndVReg) { 7153 unsigned Scale = Log2_32(SizeInBytes); 7154 int64_t ImmOff = ValAndVReg->Value.getSExtValue(); 7155 7156 // Skip immediates that can be selected in the load/store addressing 7157 // mode. 7158 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && 7159 ImmOff < (0x1000 << Scale)) 7160 return std::nullopt; 7161 7162 // Helper lambda to decide whether or not it is preferable to emit an add. 
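  // Illustrative values: an offset of 0x123000 can be materialized with a
  // single 'add ..., #0x123, lsl #12', so we prefer the add and bail out; an
  // offset like 0x12345678 cannot, so the [base, xreg] form below wins.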
7163 auto isPreferredADD = [](int64_t ImmOff) { 7164 // Constants in [0x0, 0xfff] can be encoded in an add. 7165 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) 7166 return true; 7167 7168 // Can it be encoded in an add lsl #12? 7169 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) 7170 return false; 7171 7172 // It can be encoded in an add lsl #12, but we may not want to. If it is 7173 // possible to select this as a single movz, then prefer that. A single 7174 // movz is faster than an add with a shift. 7175 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && 7176 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; 7177 }; 7178 7179 // If the immediate can be encoded in a single add/sub, then bail out. 7180 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) 7181 return std::nullopt; 7182 } 7183 7184 // Try to fold shifts into the addressing mode. 7185 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); 7186 if (AddrModeFns) 7187 return AddrModeFns; 7188 7189 // If that doesn't work, see if it's possible to fold in registers from 7190 // a GEP. 7191 return selectAddrModeRegisterOffset(Root); 7192 } 7193 7194 /// This is used for computing addresses like this: 7195 /// 7196 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] 7197 /// 7198 /// Where we have a 64-bit base register, a 32-bit offset register, and an 7199 /// extend (which may or may not be signed). 7200 InstructionSelector::ComplexRendererFns 7201 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, 7202 unsigned SizeInBytes) const { 7203 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7204 7205 MachineInstr *PtrAdd = 7206 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7207 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) 7208 return std::nullopt; 7209 7210 MachineOperand &LHS = PtrAdd->getOperand(1); 7211 MachineOperand &RHS = PtrAdd->getOperand(2); 7212 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); 7213 7214 // The first case is the same as selectAddrModeXRO, except we need an extend. 7215 // In this case, we try to find a shift and extend, and fold them into the 7216 // addressing mode. 7217 // 7218 // E.g. 7219 // 7220 // off_reg = G_Z/S/ANYEXT ext_reg 7221 // val = G_CONSTANT LegalShiftVal 7222 // shift = G_SHL off_reg val 7223 // ptr = G_PTR_ADD base_reg shift 7224 // x = G_LOAD ptr 7225 // 7226 // In this case we can get a load like this: 7227 // 7228 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] 7229 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), 7230 SizeInBytes, /*WantsExt=*/true); 7231 if (ExtendedShl) 7232 return ExtendedShl; 7233 7234 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. 7235 // 7236 // e.g. 7237 // ldr something, [base_reg, ext_reg, sxtw] 7238 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) 7239 return std::nullopt; 7240 7241 // Check if this is an extend. We'll get an extend type if it is. 7242 AArch64_AM::ShiftExtendType Ext = 7243 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); 7244 if (Ext == AArch64_AM::InvalidShiftExtend) 7245 return std::nullopt; 7246 7247 // Need a 32-bit wide register. 7248 MachineIRBuilder MIB(*PtrAdd); 7249 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(), 7250 AArch64::GPR32RegClass, MIB); 7251 unsigned SignExtend = Ext == AArch64_AM::SXTW; 7252 7253 // Base is LHS, offset is ExtReg. 
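  // The two trailing immediates rendered below encode the extend kind
  // (SignExtend picks sxtw over uxtw) and a shift flag of 0, i.e. no extra
  // left shift in this extend-only variant.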
7254 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, 7255 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, 7256 [=](MachineInstrBuilder &MIB) { 7257 MIB.addImm(SignExtend); 7258 MIB.addImm(0); 7259 }}}; 7260 } 7261 7262 /// Select a "register plus unscaled signed 9-bit immediate" address. This 7263 /// should only match when there is an offset that is not valid for a scaled 7264 /// immediate addressing mode. The "Size" argument is the size in bytes of the 7265 /// memory reference, which is needed here to know what is valid for a scaled 7266 /// immediate. 7267 InstructionSelector::ComplexRendererFns 7268 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, 7269 unsigned Size) const { 7270 MachineRegisterInfo &MRI = 7271 Root.getParent()->getParent()->getParent()->getRegInfo(); 7272 7273 if (!Root.isReg()) 7274 return std::nullopt; 7275 7276 if (!isBaseWithConstantOffset(Root, MRI)) 7277 return std::nullopt; 7278 7279 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7280 7281 MachineOperand &OffImm = RootDef->getOperand(2); 7282 if (!OffImm.isReg()) 7283 return std::nullopt; 7284 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); 7285 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT) 7286 return std::nullopt; 7287 int64_t RHSC; 7288 MachineOperand &RHSOp1 = RHS->getOperand(1); 7289 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) 7290 return std::nullopt; 7291 RHSC = RHSOp1.getCImm()->getSExtValue(); 7292 7293 if (RHSC >= -256 && RHSC < 256) { 7294 MachineOperand &Base = RootDef->getOperand(1); 7295 return {{ 7296 [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, 7297 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, 7298 }}; 7299 } 7300 return std::nullopt; 7301 } 7302 7303 InstructionSelector::ComplexRendererFns 7304 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, 7305 unsigned Size, 7306 MachineRegisterInfo &MRI) const { 7307 if (RootDef.getOpcode() != AArch64::G_ADD_LOW) 7308 return std::nullopt; 7309 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); 7310 if (Adrp.getOpcode() != AArch64::ADRP) 7311 return std::nullopt; 7312 7313 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. 7314 auto Offset = Adrp.getOperand(1).getOffset(); 7315 if (Offset % Size != 0) 7316 return std::nullopt; 7317 7318 auto GV = Adrp.getOperand(1).getGlobal(); 7319 if (GV->isThreadLocal()) 7320 return std::nullopt; 7321 7322 auto &MF = *RootDef.getParent()->getParent(); 7323 if (GV->getPointerAlignment(MF.getDataLayout()) < Size) 7324 return std::nullopt; 7325 7326 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); 7327 MachineIRBuilder MIRBuilder(RootDef); 7328 Register AdrpReg = Adrp.getOperand(0).getReg(); 7329 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, 7330 [=](MachineInstrBuilder &MIB) { 7331 MIB.addGlobalAddress(GV, Offset, 7332 OpFlags | AArch64II::MO_PAGEOFF | 7333 AArch64II::MO_NC); 7334 }}}; 7335 } 7336 7337 /// Select a "register plus scaled unsigned 12-bit immediate" address. The 7338 /// "Size" argument is the size in bytes of the memory reference, which 7339 /// determines the scale. 
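/// For example, an 8-byte access at 'base + 64' is rendered with the scaled
/// immediate 64 >> 3 == 8, while an offset that is not a multiple of the
/// access size is left for the unscaled or fallback forms below.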
7340 InstructionSelector::ComplexRendererFns 7341 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, 7342 unsigned Size) const { 7343 MachineFunction &MF = *Root.getParent()->getParent()->getParent(); 7344 MachineRegisterInfo &MRI = MF.getRegInfo(); 7345 7346 if (!Root.isReg()) 7347 return std::nullopt; 7348 7349 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7350 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { 7351 return {{ 7352 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, 7353 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7354 }}; 7355 } 7356 7357 CodeModel::Model CM = MF.getTarget().getCodeModel(); 7358 // Check if we can fold in the ADD of small code model ADRP + ADD address. 7359 if (CM == CodeModel::Small) { 7360 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); 7361 if (OpFns) 7362 return OpFns; 7363 } 7364 7365 if (isBaseWithConstantOffset(Root, MRI)) { 7366 MachineOperand &LHS = RootDef->getOperand(1); 7367 MachineOperand &RHS = RootDef->getOperand(2); 7368 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); 7369 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); 7370 7371 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); 7372 unsigned Scale = Log2_32(Size); 7373 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { 7374 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) 7375 return {{ 7376 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, 7377 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7378 }}; 7379 7380 return {{ 7381 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, 7382 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7383 }}; 7384 } 7385 } 7386 7387 // Before falling back to our general case, check if the unscaled 7388 // instructions can handle this. If so, that's preferable. 7389 if (selectAddrModeUnscaled(Root, Size)) 7390 return std::nullopt; 7391 7392 return {{ 7393 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 7394 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7395 }}; 7396 } 7397 7398 /// Given a shift instruction, return the correct shift type for that 7399 /// instruction. 7400 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { 7401 switch (MI.getOpcode()) { 7402 default: 7403 return AArch64_AM::InvalidShiftExtend; 7404 case TargetOpcode::G_SHL: 7405 return AArch64_AM::LSL; 7406 case TargetOpcode::G_LSHR: 7407 return AArch64_AM::LSR; 7408 case TargetOpcode::G_ASHR: 7409 return AArch64_AM::ASR; 7410 case TargetOpcode::G_ROTR: 7411 return AArch64_AM::ROR; 7412 } 7413 } 7414 7415 /// Select a "shifted register" operand. If the value is not shifted, set the 7416 /// shift operand to a default value of "lsl 0". 7417 InstructionSelector::ComplexRendererFns 7418 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, 7419 bool AllowROR) const { 7420 if (!Root.isReg()) 7421 return std::nullopt; 7422 MachineRegisterInfo &MRI = 7423 Root.getParent()->getParent()->getParent()->getRegInfo(); 7424 7425 // Check if the operand is defined by an instruction which corresponds to 7426 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. 
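  // Illustration: for '%r = G_SHL %x, 3' this folds to %x plus a shifter
  // immediate encoding 'lsl #3', letting a user such as G_ADD be selected as
  // an ADD with a shifted-register operand.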
7427 MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); 7428 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); 7429 if (ShType == AArch64_AM::InvalidShiftExtend) 7430 return std::nullopt; 7431 if (ShType == AArch64_AM::ROR && !AllowROR) 7432 return std::nullopt; 7433 if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) 7434 return std::nullopt; 7435 7436 // Need an immediate on the RHS. 7437 MachineOperand &ShiftRHS = ShiftInst->getOperand(2); 7438 auto Immed = getImmedFromMO(ShiftRHS); 7439 if (!Immed) 7440 return std::nullopt; 7441 7442 // We have something that we can fold. Fold in the shift's LHS and RHS into 7443 // the instruction. 7444 MachineOperand &ShiftLHS = ShiftInst->getOperand(1); 7445 Register ShiftReg = ShiftLHS.getReg(); 7446 7447 unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); 7448 unsigned Val = *Immed & (NumBits - 1); 7449 unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); 7450 7451 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, 7452 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; 7453 } 7454 7455 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( 7456 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { 7457 unsigned Opc = MI.getOpcode(); 7458 7459 // Handle explicit extend instructions first. 7460 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { 7461 unsigned Size; 7462 if (Opc == TargetOpcode::G_SEXT) 7463 Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 7464 else 7465 Size = MI.getOperand(2).getImm(); 7466 assert(Size != 64 && "Extend from 64 bits?"); 7467 switch (Size) { 7468 case 8: 7469 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB; 7470 case 16: 7471 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH; 7472 case 32: 7473 return AArch64_AM::SXTW; 7474 default: 7475 return AArch64_AM::InvalidShiftExtend; 7476 } 7477 } 7478 7479 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { 7480 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 7481 assert(Size != 64 && "Extend from 64 bits?"); 7482 switch (Size) { 7483 case 8: 7484 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB; 7485 case 16: 7486 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH; 7487 case 32: 7488 return AArch64_AM::UXTW; 7489 default: 7490 return AArch64_AM::InvalidShiftExtend; 7491 } 7492 } 7493 7494 // Don't have an explicit extend. Try to handle a G_AND with a constant mask 7495 // on the RHS. 7496 if (Opc != TargetOpcode::G_AND) 7497 return AArch64_AM::InvalidShiftExtend; 7498 7499 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2)); 7500 if (!MaybeAndMask) 7501 return AArch64_AM::InvalidShiftExtend; 7502 uint64_t AndMask = *MaybeAndMask; 7503 switch (AndMask) { 7504 default: 7505 return AArch64_AM::InvalidShiftExtend; 7506 case 0xFF: 7507 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; 7508 case 0xFFFF: 7509 return !IsLoadStore ? 
AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; 7510 case 0xFFFFFFFF: 7511 return AArch64_AM::UXTW; 7512 } 7513 } 7514 7515 Register AArch64InstructionSelector::moveScalarRegClass( 7516 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const { 7517 MachineRegisterInfo &MRI = *MIB.getMRI(); 7518 auto Ty = MRI.getType(Reg); 7519 assert(!Ty.isVector() && "Expected scalars only!"); 7520 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC)) 7521 return Reg; 7522 7523 // Create a copy and immediately select it. 7524 // FIXME: We should have an emitCopy function? 7525 auto Copy = MIB.buildCopy({&RC}, {Reg}); 7526 selectCopy(*Copy, TII, MRI, TRI, RBI); 7527 return Copy.getReg(0); 7528 } 7529 7530 /// Select an "extended register" operand. This operand folds in an extend 7531 /// followed by an optional left shift. 7532 InstructionSelector::ComplexRendererFns 7533 AArch64InstructionSelector::selectArithExtendedRegister( 7534 MachineOperand &Root) const { 7535 if (!Root.isReg()) 7536 return std::nullopt; 7537 MachineRegisterInfo &MRI = 7538 Root.getParent()->getParent()->getParent()->getRegInfo(); 7539 7540 uint64_t ShiftVal = 0; 7541 Register ExtReg; 7542 AArch64_AM::ShiftExtendType Ext; 7543 MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); 7544 if (!RootDef) 7545 return std::nullopt; 7546 7547 if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) 7548 return std::nullopt; 7549 7550 // Check if we can fold a shift and an extend. 7551 if (RootDef->getOpcode() == TargetOpcode::G_SHL) { 7552 // Look for a constant on the RHS of the shift. 7553 MachineOperand &RHS = RootDef->getOperand(2); 7554 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS); 7555 if (!MaybeShiftVal) 7556 return std::nullopt; 7557 ShiftVal = *MaybeShiftVal; 7558 if (ShiftVal > 4) 7559 return std::nullopt; 7560 // Look for a valid extend instruction on the LHS of the shift. 7561 MachineOperand &LHS = RootDef->getOperand(1); 7562 MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); 7563 if (!ExtDef) 7564 return std::nullopt; 7565 Ext = getExtendTypeForInst(*ExtDef, MRI); 7566 if (Ext == AArch64_AM::InvalidShiftExtend) 7567 return std::nullopt; 7568 ExtReg = ExtDef->getOperand(1).getReg(); 7569 } else { 7570 // Didn't get a shift. Try just folding an extend. 7571 Ext = getExtendTypeForInst(*RootDef, MRI); 7572 if (Ext == AArch64_AM::InvalidShiftExtend) 7573 return std::nullopt; 7574 ExtReg = RootDef->getOperand(1).getReg(); 7575 7576 // If we have a 32 bit instruction which zeroes out the high half of a 7577 // register, we get an implicit zero extend for free. Check if we have one. 7578 // FIXME: We actually emit the extend right now even though we don't have 7579 // to. 7580 if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { 7581 MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); 7582 if (isDef32(*ExtInst)) 7583 return std::nullopt; 7584 } 7585 } 7586 7587 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister 7588 // copy. 
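  // (The extends matched here are all 8/16/32-bit wide, so the register being
  // extended is always presented as a W register, even when the arithmetic
  // itself is 64-bit.)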
7589 MachineIRBuilder MIB(*RootDef); 7590 ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB); 7591 7592 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, 7593 [=](MachineInstrBuilder &MIB) { 7594 MIB.addImm(getArithExtendImm(Ext, ShiftVal)); 7595 }}}; 7596 } 7597 7598 InstructionSelector::ComplexRendererFns 7599 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const { 7600 if (!Root.isReg()) 7601 return std::nullopt; 7602 MachineRegisterInfo &MRI = 7603 Root.getParent()->getParent()->getParent()->getRegInfo(); 7604 7605 auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI); 7606 while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST && 7607 STI.isLittleEndian()) 7608 Extract = 7609 getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI); 7610 if (!Extract) 7611 return std::nullopt; 7612 7613 if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { 7614 if (Extract->Reg == Extract->MI->getOperand(1).getReg()) { 7615 Register ExtReg = Extract->MI->getOperand(2).getReg(); 7616 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}}; 7617 } 7618 } 7619 if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) { 7620 LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg()); 7621 auto LaneIdx = getIConstantVRegValWithLookThrough( 7622 Extract->MI->getOperand(2).getReg(), MRI); 7623 if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) && 7624 LaneIdx->Value.getSExtValue() == 1) { 7625 Register ExtReg = Extract->MI->getOperand(1).getReg(); 7626 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}}; 7627 } 7628 } 7629 7630 return std::nullopt; 7631 } 7632 7633 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, 7634 const MachineInstr &MI, 7635 int OpIdx) const { 7636 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7637 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 7638 "Expected G_CONSTANT"); 7639 std::optional<int64_t> CstVal = 7640 getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI); 7641 assert(CstVal && "Expected constant value"); 7642 MIB.addImm(*CstVal); 7643 } 7644 7645 void AArch64InstructionSelector::renderLogicalImm32( 7646 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { 7647 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 7648 "Expected G_CONSTANT"); 7649 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); 7650 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); 7651 MIB.addImm(Enc); 7652 } 7653 7654 void AArch64InstructionSelector::renderLogicalImm64( 7655 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { 7656 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 7657 "Expected G_CONSTANT"); 7658 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); 7659 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); 7660 MIB.addImm(Enc); 7661 } 7662 7663 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB, 7664 const MachineInstr &MI, 7665 int OpIdx) const { 7666 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && 7667 "Expected G_FCONSTANT"); 7668 MIB.addImm( 7669 AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF())); 7670 } 7671 7672 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB, 7673 const MachineInstr &MI, 7674 int OpIdx) const { 7675 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && 7676 "Expected 
G_FCONSTANT"); 7677 MIB.addImm( 7678 AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF())); 7679 } 7680 7681 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB, 7682 const MachineInstr &MI, 7683 int OpIdx) const { 7684 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && 7685 "Expected G_FCONSTANT"); 7686 MIB.addImm( 7687 AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF())); 7688 } 7689 7690 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4( 7691 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 7692 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && 7693 "Expected G_FCONSTANT"); 7694 MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1) 7695 .getFPImm() 7696 ->getValueAPF() 7697 .bitcastToAPInt() 7698 .getZExtValue())); 7699 } 7700 7701 bool AArch64InstructionSelector::isLoadStoreOfNumBytes( 7702 const MachineInstr &MI, unsigned NumBytes) const { 7703 if (!MI.mayLoadOrStore()) 7704 return false; 7705 assert(MI.hasOneMemOperand() && 7706 "Expected load/store to have only one mem op!"); 7707 return (*MI.memoperands_begin())->getSize() == NumBytes; 7708 } 7709 7710 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { 7711 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7712 if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) 7713 return false; 7714 7715 // Only return true if we know the operation will zero-out the high half of 7716 // the 64-bit register. Truncates can be subregister copies, which don't 7717 // zero out the high bits. Copies and other copy-like instructions can be 7718 // fed by truncates, or could be lowered as subregister copies. 7719 switch (MI.getOpcode()) { 7720 default: 7721 return true; 7722 case TargetOpcode::COPY: 7723 case TargetOpcode::G_BITCAST: 7724 case TargetOpcode::G_TRUNC: 7725 case TargetOpcode::G_PHI: 7726 return false; 7727 } 7728 } 7729 7730 7731 // Perform fixups on the given PHI instruction's operands to force them all 7732 // to be the same as the destination regbank. 7733 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, 7734 const AArch64RegisterBankInfo &RBI) { 7735 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI"); 7736 Register DstReg = MI.getOperand(0).getReg(); 7737 const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg); 7738 assert(DstRB && "Expected PHI dst to have regbank assigned"); 7739 MachineIRBuilder MIB(MI); 7740 7741 // Go through each operand and ensure it has the same regbank. 7742 for (MachineOperand &MO : llvm::drop_begin(MI.operands())) { 7743 if (!MO.isReg()) 7744 continue; 7745 Register OpReg = MO.getReg(); 7746 const RegisterBank *RB = MRI.getRegBankOrNull(OpReg); 7747 if (RB != DstRB) { 7748 // Insert a cross-bank copy. 7749 auto *OpDef = MRI.getVRegDef(OpReg); 7750 const LLT &Ty = MRI.getType(OpReg); 7751 MachineBasicBlock &OpDefBB = *OpDef->getParent(); 7752 7753 // Any instruction we insert must appear after all PHIs in the block 7754 // for the block to be valid MIR. 
7755 MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator()); 7756 if (InsertPt != OpDefBB.end() && InsertPt->isPHI()) 7757 InsertPt = OpDefBB.getFirstNonPHI(); 7758 MIB.setInsertPt(*OpDef->getParent(), InsertPt); 7759 auto Copy = MIB.buildCopy(Ty, OpReg); 7760 MRI.setRegBank(Copy.getReg(0), *DstRB); 7761 MO.setReg(Copy.getReg(0)); 7762 } 7763 } 7764 } 7765 7766 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { 7767 // We're looking for PHIs; build a list so we don't invalidate iterators. 7768 MachineRegisterInfo &MRI = MF.getRegInfo(); 7769 SmallVector<MachineInstr *, 32> Phis; 7770 for (auto &BB : MF) { 7771 for (auto &MI : BB) { 7772 if (MI.getOpcode() == TargetOpcode::G_PHI) 7773 Phis.emplace_back(&MI); 7774 } 7775 } 7776 7777 for (auto *MI : Phis) { 7778 // We need to do some work here if the operand types are < 16 bit and they 7779 // are split across fpr/gpr banks. Since all types <32b on gpr 7780 // end up being assigned gpr32 regclasses, we can end up with PHIs here 7781 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't 7782 // be selecting heterogeneous regbanks for operands, but we 7783 // still need to be able to deal with it here. 7784 // 7785 // To fix this, if we have a gpr-bank operand < 32b in size and at least 7786 // one other operand is on the fpr bank, then we add cross-bank copies 7787 // to homogenize the operand banks. For simplicity the bank that we choose 7788 // to settle on is whatever bank the def operand has. For example: 7789 // 7790 // %endbb: 7791 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2 7792 // => 7793 // %bb2: 7794 // ... 7795 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16) 7796 // ... 7797 // %endbb: 7798 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 7799 bool HasGPROp = false, HasFPROp = false; 7800 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) { 7801 if (!MO.isReg()) 7802 continue; 7803 const LLT &Ty = MRI.getType(MO.getReg()); 7804 if (!Ty.isValid() || !Ty.isScalar()) 7805 break; 7806 if (Ty.getSizeInBits() >= 32) 7807 break; 7808 const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); 7809 // If for some reason we don't have a regbank yet, don't try anything. 7810 if (!RB) 7811 break; 7812 7813 if (RB->getID() == AArch64::GPRRegBankID) 7814 HasGPROp = true; 7815 else 7816 HasFPROp = true; 7817 } 7818 // We have heterogeneous regbanks; we need to fix them up. 7819 if (HasGPROp && HasFPROp) 7820 fixupPHIOpBanks(*MI, MRI, RBI); 7821 } 7822 } 7823 7824 namespace llvm { 7825 InstructionSelector * 7826 createAArch64InstructionSelector(const AArch64TargetMachine &TM, 7827 AArch64Subtarget &Subtarget, 7828 AArch64RegisterBankInfo &RBI) { 7829 return new AArch64InstructionSelector(TM, Subtarget, RBI); 7830 } 7831 } 7832