//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file /// This file implements the targeting of the InstructionSelector class for /// AArch64. /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// #include "AArch64GlobalISelUtils.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterBankInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/raw_ostream.h" #include #define DEBUG_TYPE "aarch64-isel" using namespace llvm; using namespace MIPatternMatch; using namespace AArch64GISelUtils; namespace llvm { class BlockFrequencyInfo; class ProfileSummaryInfo; } namespace { #define GET_GLOBALISEL_PREDICATE_BITSET #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override { InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); MIB.setMF(MF); // hasFnAttribute() is expensive to call on every BRCOND selection, so // cache it here for each run of the selector. ProduceNonFlagSettingCondBr = !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); MFReturnAddr = Register(); processPHIs(MF); } private: /// tblgen-erated 'select' implementation, used as the initial selector for /// the patterns that don't require complex C++. bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; // A lowering phase that runs before any selection attempts. // Returns true if the instruction was modified. bool preISelLower(MachineInstr &I); // An early selection function that runs before the selectImpl() call. bool earlySelect(MachineInstr &I); // Do some preprocessing of G_PHIs before we begin selection. void processPHIs(MachineFunction &MF); bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI); /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
bool contractCrossBankCopyIntoStore(MachineInstr &I, MachineRegisterInfo &MRI); bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; ///@{ /// Helper functions for selectCompareBranch. bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const; bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const; bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const; bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const; ///@} bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI); bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI); // Helper to generate an equivalent of scalar_to_vector into a new register, // returned via 'Dst'. MachineInstr *emitScalarToVector(unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, MachineIRBuilder &MIRBuilder) const; /// Emit a lane insert into \p DstReg, or a new vector register if /// std::nullopt is provided. /// /// The lane inserted into is defined by \p LaneIdx. The vector source /// register is given by \p SrcReg. The register containing the element is /// given by \p EltReg. MachineInstr *emitLaneInsert(std::optional DstReg, Register SrcReg, Register EltReg, unsigned LaneIdx, const RegisterBank &RB, MachineIRBuilder &MIRBuilder) const; /// Emit a sequence of instructions representing a constant \p CV for a /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.) /// /// \returns the last instruction in the sequence on success, and nullptr /// otherwise. 
MachineInstr *emitConstantVector(Register Dst, Constant *CV, MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI); bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI); bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, MachineRegisterInfo &MRI); /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a /// SUBREG_TO_REG. bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI); bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI); bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI); bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI); bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI); bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI); bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI); bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI); /// Helper function to select vector load intrinsics like /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc. /// \p Opc is the opcode that the selected instruction should use. /// \p NumVecs is the number of vector destinations for the instruction. /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction. 
bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs, MachineInstr &I); bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineRegisterInfo &MRI); bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI); bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI); bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI); bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI); bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); unsigned emitConstantPoolEntry(const Constant *CPVal, MachineFunction &MF) const; MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, MachineIRBuilder &MIRBuilder) const; // Emit a vector concat operation. MachineInstr *emitVectorConcat(std::optional Dst, Register Op1, Register Op2, MachineIRBuilder &MIRBuilder) const; // Emit an integer compare between LHS and RHS, which checks for Predicate. MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; /// Emit a floating point comparison between \p LHS and \p RHS. /// \p Pred if given is the intended predicate to use. MachineInstr * emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, std::optional = std::nullopt) const; MachineInstr * emitInstr(unsigned Opcode, std::initializer_list DstOps, std::initializer_list SrcOps, MachineIRBuilder &MIRBuilder, const ComplexRendererFns &RenderFns = std::nullopt) const; /// Helper function to emit an add or sub instruction. /// /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above /// in a specific order. 
/// /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. /// /// \code /// const std::array, 4> Table { /// {{AArch64::ADDXri, AArch64::ADDWri}, /// {AArch64::ADDXrs, AArch64::ADDWrs}, /// {AArch64::ADDXrr, AArch64::ADDWrr}, /// {AArch64::SUBXri, AArch64::SUBWri}, /// {AArch64::ADDXrx, AArch64::ADDWrx}}}; /// \endcode /// /// Each row in the table corresponds to a different addressing mode. Each /// column corresponds to a different register size. /// /// \attention Rows must be structured as follows: /// - Row 0: The ri opcode variants /// - Row 1: The rs opcode variants /// - Row 2: The rr opcode variants /// - Row 3: The ri opcode variants for negative immediates /// - Row 4: The rx opcode variants /// /// \attention Columns must be structured as follows: /// - Column 0: The 64-bit opcode variants /// - Column 1: The 32-bit opcode variants /// /// \p Dst is the destination register of the binop to emit. /// \p LHS is the left-hand operand of the binop to emit. /// \p RHS is the right-hand operand of the binop to emit. 
MachineInstr *emitAddSub( const std::array, 5> &AddrModeAndSizeToOpcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS, AArch64CC::CondCode CC, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitExtractVectorElt(std::optional DstReg, const RegisterBank &DstRB, LLT ScalarTy, Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2, AArch64CC::CondCode Pred, MachineIRBuilder &MIRBuilder) const; /// Emit a CSet for a FP compare. /// /// \p Dst is expected to be a 32-bit scalar register. MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const; /// Emit an instruction that sets NZCV to the carry-in expected by \p I. /// Might elide the instruction if the previous instruction already sets NZCV /// correctly. MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg); /// Emit the overflow op for \p Opcode. /// /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO, /// G_USUBO, etc. 
std::pair emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI); /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). /// In some cases this is even possible with OR operations in the expression. MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const; MachineInstr *emitConditionalComparison(Register LHS, Register RHS, CmpInst::Predicate CC, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, MachineIRBuilder &MIB) const; MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const; /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const; /// Emit a CB(N)Z instruction which branches to \p DestMBB. MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, MachineBasicBlock *DestMBB, MachineIRBuilder &MIB) const; // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. // We use these manually instead of using the importer since it doesn't // support SDNodeXForm. 
ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; ComplexRendererFns selectArithImmed(MachineOperand &Root) const; ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, unsigned Size) const; ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { return selectAddrModeUnscaled(Root, 1); } ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { return selectAddrModeUnscaled(Root, 2); } ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { return selectAddrModeUnscaled(Root, 4); } ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { return selectAddrModeUnscaled(Root, 8); } ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { return selectAddrModeUnscaled(Root, 16); } /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used /// from complex pattern matchers like selectAddrModeIndexed(). ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, MachineRegisterInfo &MRI) const; ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, unsigned Size) const; template ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { return selectAddrModeIndexed(Root, Width / 8); } bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, const MachineRegisterInfo &MRI) const; ComplexRendererFns selectAddrModeShiftedExtendXReg(MachineOperand &Root, unsigned SizeInBytes) const; /// Returns a \p ComplexRendererFns which contains a base, offset, and whether /// or not a shift + extend should be folded into an addressing mode. 
Returns /// None when this is not profitable or possible. ComplexRendererFns selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, unsigned SizeInBytes, bool WantsExt) const; ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, unsigned SizeInBytes) const; template ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { return selectAddrModeXRO(Root, Width / 8); } ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, unsigned SizeInBytes) const; template ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { return selectAddrModeWRO(Root, Width / 8); } ComplexRendererFns selectShiftedRegister(MachineOperand &Root, bool AllowROR = false) const; ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { return selectShiftedRegister(Root); } ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { return selectShiftedRegister(Root, true); } /// Given an extend instruction, determine the correct shift-extend type for /// that instruction. /// /// If the instruction is going to be used in a load or store, pass /// \p IsLoadStore = true. AArch64_AM::ShiftExtendType getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore = false) const; /// Move \p Reg to \p RC if \p Reg is not already on \p RC. /// /// \returns Either \p Reg if no change was necessary, or the new register /// created by moving \p Reg. /// /// Note: This uses emitCopy right now. 
Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const; ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; ComplexRendererFns selectExtractHigh(MachineOperand &Root) const; void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx = -1) const; void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx = -1) const; void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); // Optimization methods. bool tryOptSelect(GSelect &Sel); bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; /// Return true if \p MI is a load or store of \p NumBytes bytes. bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit /// register zeroed out. In other words, the result of MI has been explicitly /// zero extended. bool isDef32(const MachineInstr &MI) const; const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; bool ProduceNonFlagSettingCondBr = false; // Some cached values used during selection. 
// We use LR as a live-in register, and we keep track of it here as it can be // clobbered by calls. Register MFReturnAddr; MachineIRBuilder MIB; #define GET_GLOBALISEL_PREDICATES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL // We declare the temporaries used by selectImpl() in the class to minimize the // cost of constructing placeholder values. #define GET_GLOBALISEL_TEMPORARIES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_DECL }; } // end anonymous namespace #define GET_GLOBALISEL_IMPL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_IMPL AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT { } // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. // /// Given a register bank, and a type, return the smallest register class that /// can represent that combination. static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, bool GetAllRegSet = false) { if (RB.getID() == AArch64::GPRRegBankID) { if (Ty.getSizeInBits() <= 32) return GetAllRegSet ? &AArch64::GPR32allRegClass : &AArch64::GPR32RegClass; if (Ty.getSizeInBits() == 64) return GetAllRegSet ? 
&AArch64::GPR64allRegClass : &AArch64::GPR64RegClass; if (Ty.getSizeInBits() == 128) return &AArch64::XSeqPairsClassRegClass; return nullptr; } if (RB.getID() == AArch64::FPRRegBankID) { switch (Ty.getSizeInBits()) { case 8: return &AArch64::FPR8RegClass; case 16: return &AArch64::FPR16RegClass; case 32: return &AArch64::FPR32RegClass; case 64: return &AArch64::FPR64RegClass; case 128: return &AArch64::FPR128RegClass; } return nullptr; } return nullptr; } /// Given a register bank, and size in bits, return the smallest register class /// that can represent that combination. static const TargetRegisterClass * getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, bool GetAllRegSet = false) { unsigned RegBankID = RB.getID(); if (RegBankID == AArch64::GPRRegBankID) { if (SizeInBits <= 32) return GetAllRegSet ? &AArch64::GPR32allRegClass : &AArch64::GPR32RegClass; if (SizeInBits == 64) return GetAllRegSet ? &AArch64::GPR64allRegClass : &AArch64::GPR64RegClass; if (SizeInBits == 128) return &AArch64::XSeqPairsClassRegClass; } if (RegBankID == AArch64::FPRRegBankID) { switch (SizeInBits) { default: return nullptr; case 8: return &AArch64::FPR8RegClass; case 16: return &AArch64::FPR16RegClass; case 32: return &AArch64::FPR32RegClass; case 64: return &AArch64::FPR64RegClass; case 128: return &AArch64::FPR128RegClass; } } return nullptr; } /// Returns the correct subregister to use for a given register class. static bool getSubRegForClass(const TargetRegisterClass *RC, const TargetRegisterInfo &TRI, unsigned &SubReg) { switch (TRI.getRegSizeInBits(*RC)) { case 8: SubReg = AArch64::bsub; break; case 16: SubReg = AArch64::hsub; break; case 32: if (RC != &AArch64::FPR32RegClass) SubReg = AArch64::sub_32; else SubReg = AArch64::ssub; break; case 64: SubReg = AArch64::dsub; break; default: LLVM_DEBUG( dbgs() << "Couldn't find appropriate subregister for register class."); return false; } return true; } /// Returns the minimum size the given register bank can hold. 
static unsigned getMinSizeForRegBank(const RegisterBank &RB) { switch (RB.getID()) { case AArch64::GPRRegBankID: return 32; case AArch64::FPRRegBankID: return 8; default: llvm_unreachable("Tried to get minimum size for unknown register bank."); } } /// Create a REG_SEQUENCE instruction using the registers in \p Regs. /// Helper function for functions like createDTuple and createQTuple. /// /// \p RegClassIDs - The list of register class IDs available for some tuple of /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is /// expected to contain between 2 and 4 tuple classes. /// /// \p SubRegs - The list of subregister classes associated with each register /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0 /// subregister class. The index of each subregister class is expected to /// correspond with the index of each register class. /// /// \returns Either the destination register of REG_SEQUENCE instruction that /// was created, or the 0th element of \p Regs if \p Regs contains a single /// element. static Register createTuple(ArrayRef Regs, const unsigned RegClassIDs[], const unsigned SubRegs[], MachineIRBuilder &MIB) { unsigned NumRegs = Regs.size(); if (NumRegs == 1) return Regs[0]; assert(NumRegs >= 2 && NumRegs <= 4 && "Only support between two and 4 registers in a tuple!"); const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo(); auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]); auto RegSequence = MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {}); for (unsigned I = 0, E = Regs.size(); I < E; ++I) { RegSequence.addUse(Regs[I]); RegSequence.addImm(SubRegs[I]); } return RegSequence.getReg(0); } /// Create a tuple of D-registers using the registers in \p Regs. 
static Register createDTuple(ArrayRef Regs, MachineIRBuilder &MIB) { static const unsigned RegClassIDs[] = { AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, AArch64::dsub2, AArch64::dsub3}; return createTuple(Regs, RegClassIDs, SubRegs, MIB); } /// Create a tuple of Q-registers using the registers in \p Regs. static Register createQTuple(ArrayRef Regs, MachineIRBuilder &MIB) { static const unsigned RegClassIDs[] = { AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, AArch64::qsub3}; return createTuple(Regs, RegClassIDs, SubRegs, MIB); } static std::optional getImmedFromMO(const MachineOperand &Root) { auto &MI = *Root.getParent(); auto &MBB = *MI.getParent(); auto &MF = *MBB.getParent(); auto &MRI = MF.getRegInfo(); uint64_t Immed; if (Root.isImm()) Immed = Root.getImm(); else if (Root.isCImm()) Immed = Root.getCImm()->getZExtValue(); else if (Root.isReg()) { auto ValAndVReg = getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true); if (!ValAndVReg) return std::nullopt; Immed = ValAndVReg->Value.getSExtValue(); } else return std::nullopt; return Immed; } /// Check whether \p I is a currently unsupported binary operation: /// - it has an unsized type /// - an operand is not a vreg /// - all operands are not in the same bank /// These are checks that should someday live in the verifier, but right now, /// these are mostly limitations of the aarch64 selector. static bool unsupportedBinOp(const MachineInstr &I, const AArch64RegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const AArch64RegisterInfo &TRI) { LLT Ty = MRI.getType(I.getOperand(0).getReg()); if (!Ty.isValid()) { LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); return true; } const RegisterBank *PrevOpBank = nullptr; for (auto &MO : I.operands()) { // FIXME: Support non-register operands. 
if (!MO.isReg()) { LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); return true; } // FIXME: Can generic operations have physical registers operands? If // so, this will need to be taught about that, and we'll need to get the // bank out of the minimal class for the register. // Either way, this needs to be documented (and possibly verified). if (!MO.getReg().isVirtual()) { LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); return true; } const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); if (!OpBank) { LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); return true; } if (PrevOpBank && OpBank != PrevOpBank) { LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); return true; } PrevOpBank = OpBank; } return false; } /// Select the AArch64 opcode for the basic binary operation \p GenericOpc /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID /// and of size \p OpSize. /// \returns \p GenericOpc if the combination is unsupported. 
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  if (RegBankID == AArch64::GPRRegBankID) {
    // 32-bit GPR: only the variable shifts are handled here.
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    }

    // 64-bit GPR: pointer add plus the variable shifts.
    if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }

    return GenericOpc;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    // Single-precision scalar arithmetic.
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    }

    // Double-precision scalar arithmetic, plus G_OR on a 64-bit FPR value.
    if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }

    return GenericOpc;
  }

  // Unknown bank: report "unsupported" by echoing the generic opcode.
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize.  This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  // Anything that is not a G_STORE is treated as a load.
  const bool IsStore = GenericOpc == TargetOpcode::G_STORE;
  const auto Pick = [IsStore](unsigned StoreOpc, unsigned LoadOpc) {
    return IsStore ? StoreOpc : LoadOpc;
  };

  if (RegBankID == AArch64::GPRRegBankID) {
    switch (OpSize) {
    case 8:
      return Pick(AArch64::STRBBui, AArch64::LDRBBui);
    case 16:
      return Pick(AArch64::STRHHui, AArch64::LDRHHui);
    case 32:
      return Pick(AArch64::STRWui, AArch64::LDRWui);
    case 64:
      return Pick(AArch64::STRXui, AArch64::LDRXui);
    default:
      return GenericOpc;
    }
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (OpSize) {
    case 8:
      return Pick(AArch64::STRBui, AArch64::LDRBui);
    case 16:
      return Pick(AArch64::STRHui, AArch64::LDRHui);
    case 32:
      return Pick(AArch64::STRSui, AArch64::LDRSui);
    case 64:
      return Pick(AArch64::STRDui, AArch64::LDRDui);
    case 128:
      return Pick(AArch64::STRQui, AArch64::LDRQui);
    default:
      return GenericOpc;
    }
  }

  return GenericOpc;
}

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
///
/// Operand 1 of \p I is rewritten to read the new copy. Always returns true.
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  // Emit "<new vreg of class To> = COPY SrcReg:SubReg" in front of I.
  MachineIRBuilder Builder(I);
  auto NewCopy = Builder.buildInstr(TargetOpcode::COPY, {To}, {});
  NewCopy.addReg(SrcReg, 0, SubReg);

  // Redirect I's source operand to the freshly created copy.
  I.getOperand(1).setReg(NewCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  const Register DstReg = I.getOperand(0).getReg();
  if (!DstReg.isPhysical())
    RBI.constrainGenericRegister(DstReg, *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); // Special casing for cross-bank copies of s1s. We can technically represent // a 1-bit value with any size of register. The minimum size for a GPR is 32 // bits. So, we need to put the FPR on 32 bits as well. // // FIXME: I'm not sure if this case holds true outside of copies. If it does, // then we can pull it into the helpers that get the appropriate class for a // register bank. Or make a new helper that carries along some constraint // information. if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) SrcSize = DstSize = 32; return {getMinClassForRegBank(SrcRegBank, SrcSize, true), getMinClassForRegBank(DstRegBank, DstSize, true)}; } // FIXME: We need some sort of API in RBI/TRI to allow generic code to // constrain operands of simple instructions given a TargetRegisterClass // and LLT static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI, const RegisterBankInfo &RBI) { for (MachineOperand &MO : I.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (!Reg) continue; if (Reg.isPhysical()) continue; LLT Ty = MRI.getType(Reg); const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); const TargetRegisterClass *RC = RegClassOrBank.dyn_cast(); if (!RC) { const RegisterBank &RB = *RegClassOrBank.get(); RC = getRegClassForTypeOnBank(Ty, RB); if (!RC) { LLVM_DEBUG( dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n"); break; } } RBI.constrainGenericRegister(Reg, *RC, MRI); } return true; } static 
bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); // Find the correct register classes for the source and destination registers. const TargetRegisterClass *SrcRC; const TargetRegisterClass *DstRC; std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI); if (!DstRC) { LLVM_DEBUG(dbgs() << "Unexpected dest size " << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n'); return false; } // Is this a copy? If so, then we may need to insert a subregister copy. if (I.isCopy()) { // Yes. Check if there's anything to fix up. if (!SrcRC) { LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n"); return false; } unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); unsigned DstSize = TRI.getRegSizeInBits(*DstRC); unsigned SubReg; // If the source bank doesn't support a subregister copy small enough, // then we first need to copy to the destination bank. if (getMinSizeForRegBank(SrcRegBank) > DstSize) { const TargetRegisterClass *DstTempRC = getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true); getSubRegForClass(DstRC, TRI, SubReg); MachineIRBuilder MIB(I); auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg}); copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg); } else if (SrcSize > DstSize) { // If the source register is bigger than the destination we need to // perform a subregister copy. const TargetRegisterClass *SubRegRC = getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); getSubRegForClass(SubRegRC, TRI, SubReg); copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg); } else if (DstSize > SrcSize) { // If the destination register is bigger than the source we need to do // a promotion using SUBREG_TO_REG. 
const TargetRegisterClass *PromotionRC = getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true); getSubRegForClass(SrcRC, TRI, SubReg); Register PromoteReg = MRI.createVirtualRegister(PromotionRC); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG), PromoteReg) .addImm(0) .addUse(SrcReg) .addImm(SubReg); MachineOperand &RegOp = I.getOperand(1); RegOp.setReg(PromoteReg); } // If the destination is a physical register, then there's nothing to // change, so we're done. if (DstReg.isPhysical()) return true; } // No need to constrain SrcReg. It will get constrained when we hit another // of its use or its defs. Copies do not have constraints. if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) << " operand\n"); return false; } // If this a GPR ZEXT that we want to just reduce down into a copy. // The sizes will be mismatched with the source < 32b but that's ok. if (I.getOpcode() == TargetOpcode::G_ZEXT) { I.setDesc(TII.get(AArch64::COPY)); assert(SrcRegBank.getID() == AArch64::GPRRegBankID); return selectCopy(I, TII, MRI, TRI, RBI); } I.setDesc(TII.get(AArch64::COPY)); return true; } static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { if (!DstTy.isScalar() || !SrcTy.isScalar()) return GenericOpc; const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); switch (DstSize) { case 32: switch (SrcSize) { case 32: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUWSri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUWSri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUWSr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUWSr; default: return GenericOpc; } case 64: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUXSri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUXSri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUWDr; case 
TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUWDr; default: return GenericOpc; } default: return GenericOpc; } case 64: switch (SrcSize) { case 32: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUWDri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUWDri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUXSr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUXSr; default: return GenericOpc; } case 64: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUXDri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUXDri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUXDr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUXDr; default: return GenericOpc; } default: return GenericOpc; } default: return GenericOpc; }; return GenericOpc; } MachineInstr * AArch64InstructionSelector::emitSelect(Register Dst, Register True, Register False, AArch64CC::CondCode CC, MachineIRBuilder &MIB) const { MachineRegisterInfo &MRI = *MIB.getMRI(); assert(RBI.getRegBank(False, MRI, TRI)->getID() == RBI.getRegBank(True, MRI, TRI)->getID() && "Expected both select operands to have the same regbank?"); LLT Ty = MRI.getType(True); if (Ty.isVector()) return nullptr; const unsigned Size = Ty.getSizeInBits(); assert((Size == 32 || Size == 64) && "Expected 32 bit or 64 bit select only?"); const bool Is32Bit = Size == 32; if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); return &*FCSel; } // By default, we'll try and emit a CSEL. unsigned Opc = Is32Bit ? 
AArch64::CSELWr : AArch64::CSELXr; bool Optimized = false; auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, &Optimized](Register &Reg, Register &OtherReg, bool Invert) { if (Optimized) return false; // Attempt to fold: // // %sub = G_SUB 0, %x // %select = G_SELECT cc, %reg, %sub // // Into: // %select = CSNEG %reg, %x, cc Register MatchReg; if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; Reg = MatchReg; if (Invert) { CC = AArch64CC::getInvertedCondCode(CC); std::swap(Reg, OtherReg); } return true; } // Attempt to fold: // // %xor = G_XOR %x, -1 // %select = G_SELECT cc, %reg, %xor // // Into: // %select = CSINV %reg, %x, cc if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; Reg = MatchReg; if (Invert) { CC = AArch64CC::getInvertedCondCode(CC); std::swap(Reg, OtherReg); } return true; } // Attempt to fold: // // %add = G_ADD %x, 1 // %select = G_SELECT cc, %reg, %add // // Into: // %select = CSINC %reg, %x, cc if (mi_match(Reg, MRI, m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)), m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) { Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; Reg = MatchReg; if (Invert) { CC = AArch64CC::getInvertedCondCode(CC); std::swap(Reg, OtherReg); } return true; } return false; }; // Helper lambda which tries to use CSINC/CSINV for the instruction when its // true/false values are constants. // FIXME: All of these patterns already exist in tablegen. We should be // able to import these. auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, &Optimized]() { if (Optimized) return false; auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI); auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI); if (!TrueCst && !FalseCst) return false; Register ZReg = Is32Bit ? 
AArch64::WZR : AArch64::XZR; if (TrueCst && FalseCst) { int64_t T = TrueCst->Value.getSExtValue(); int64_t F = FalseCst->Value.getSExtValue(); if (T == 0 && F == 1) { // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; True = ZReg; False = ZReg; return true; } if (T == 0 && F == -1) { // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; True = ZReg; False = ZReg; return true; } } if (TrueCst) { int64_t T = TrueCst->Value.getSExtValue(); if (T == 1) { // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; True = False; False = ZReg; CC = AArch64CC::getInvertedCondCode(CC); return true; } if (T == -1) { // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; True = False; False = ZReg; CC = AArch64CC::getInvertedCondCode(CC); return true; } } if (FalseCst) { int64_t F = FalseCst->Value.getSExtValue(); if (F == 1) { // G_SELECT cc, t, 1 -> CSINC t, zreg, cc Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; False = ZReg; return true; } if (F == -1) { // G_SELECT cc, t, -1 -> CSINC t, zreg, cc Opc = Is32Bit ? 
AArch64::CSINVWr : AArch64::CSINVXr; False = ZReg; return true; } } return false; }; Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false); Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true); Optimized |= TryOptSelectCst(); auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI); return &*SelectInst; } static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { switch (P) { default: llvm_unreachable("Unknown condition code!"); case CmpInst::ICMP_NE: return AArch64CC::NE; case CmpInst::ICMP_EQ: return AArch64CC::EQ; case CmpInst::ICMP_SGT: return AArch64CC::GT; case CmpInst::ICMP_SGE: return AArch64CC::GE; case CmpInst::ICMP_SLT: return AArch64CC::LT; case CmpInst::ICMP_SLE: return AArch64CC::LE; case CmpInst::ICMP_UGT: return AArch64CC::HI; case CmpInst::ICMP_UGE: return AArch64CC::HS; case CmpInst::ICMP_ULT: return AArch64CC::LO; case CmpInst::ICMP_ULE: return AArch64CC::LS; } } /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC. 
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case CmpInst::FCMP_OEQ: CondCode = AArch64CC::EQ; break; case CmpInst::FCMP_OGT: CondCode = AArch64CC::GT; break; case CmpInst::FCMP_OGE: CondCode = AArch64CC::GE; break; case CmpInst::FCMP_OLT: CondCode = AArch64CC::MI; break; case CmpInst::FCMP_OLE: CondCode = AArch64CC::LS; break; case CmpInst::FCMP_ONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case CmpInst::FCMP_ORD: CondCode = AArch64CC::VC; break; case CmpInst::FCMP_UNO: CondCode = AArch64CC::VS; break; case CmpInst::FCMP_UEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case CmpInst::FCMP_UGT: CondCode = AArch64CC::HI; break; case CmpInst::FCMP_UGE: CondCode = AArch64CC::PL; break; case CmpInst::FCMP_ULT: CondCode = AArch64CC::LT; break; case CmpInst::FCMP_ULE: CondCode = AArch64CC::LE; break; case CmpInst::FCMP_UNE: CondCode = AArch64CC::NE; break; } } /// Convert an IR fp condition code to an AArch64 CC. /// This differs from changeFPCCToAArch64CC in that it returns cond codes that /// should be AND'ed instead of OR'ed. static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: changeFPCCToORAArch64CC(CC, CondCode, CondCode2); assert(CondCode2 == AArch64CC::AL); break; case CmpInst::FCMP_ONE: // (a one b) // == ((a olt b) || (a ogt b)) // == ((a ord b) && (a une b)) CondCode = AArch64CC::VC; CondCode2 = AArch64CC::NE; break; case CmpInst::FCMP_UEQ: // (a ueq b) // == ((a uno b) || (a oeq b)) // == ((a ule b) && (a uge b)) CondCode = AArch64CC::PL; CondCode2 = AArch64CC::LE; break; } } /// Return a register which can be used as a bit to test in a TB(N)Z. 
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineRegisterInfo &MRI) { assert(Reg.isValid() && "Expected valid register!"); bool HasZext = false; while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { unsigned Opc = MI->getOpcode(); if (!MI->getOperand(0).isReg() || !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) break; // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. // // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number // on the truncated x is the same as the bit number on x. if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_TRUNC) { if (Opc == TargetOpcode::G_ZEXT) HasZext = true; Register NextReg = MI->getOperand(1).getReg(); // Did we find something worth folding? if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg)) break; // NextReg is worth folding. Keep looking. Reg = NextReg; continue; } // Attempt to find a suitable operation with a constant on one side. std::optional C; Register TestReg; switch (Opc) { default: break; case TargetOpcode::G_AND: case TargetOpcode::G_XOR: { TestReg = MI->getOperand(1).getReg(); Register ConstantReg = MI->getOperand(2).getReg(); auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); if (!VRegAndVal) { // AND commutes, check the other side for a constant. // FIXME: Can we canonicalize the constant so that it's always on the // same side at some point earlier? 
std::swap(ConstantReg, TestReg); VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); } if (VRegAndVal) { if (HasZext) C = VRegAndVal->Value.getZExtValue(); else C = VRegAndVal->Value.getSExtValue(); } break; } case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: case TargetOpcode::G_SHL: { TestReg = MI->getOperand(1).getReg(); auto VRegAndVal = getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); if (VRegAndVal) C = VRegAndVal->Value.getSExtValue(); break; } } // Didn't find a constant or viable register. Bail out of the loop. if (!C || !TestReg.isValid()) break; // We found a suitable instruction with a constant. Check to see if we can // walk through the instruction. Register NextReg; unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); switch (Opc) { default: break; case TargetOpcode::G_AND: // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. if ((*C >> Bit) & 1) NextReg = TestReg; break; case TargetOpcode::G_SHL: // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in // the type of the register. if (*C <= Bit && (Bit - *C) < TestRegSize) { NextReg = TestReg; Bit = Bit - *C; } break; case TargetOpcode::G_ASHR: // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits // in x NextReg = TestReg; Bit = Bit + *C; if (Bit >= TestRegSize) Bit = TestRegSize - 1; break; case TargetOpcode::G_LSHR: // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x if ((Bit + *C) < TestRegSize) { NextReg = TestReg; Bit = Bit + *C; } break; case TargetOpcode::G_XOR: // We can walk through a G_XOR by inverting whether we use tbz/tbnz when // appropriate. // // e.g. If x' = xor x, c, and the b-th bit is set in c then // // tbz x', b -> tbnz x, b // // Because x' only has the b-th bit set if x does not. if ((*C >> Bit) & 1) Invert = !Invert; NextReg = TestReg; break; } // Check if we found anything worth folding. 
if (!NextReg.isValid()) return Reg; Reg = NextReg; } return Reg; } MachineInstr *AArch64InstructionSelector::emitTestBit( Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { assert(TestReg.isValid()); assert(ProduceNonFlagSettingCondBr && "Cannot emit TB(N)Z with speculation tracking!"); MachineRegisterInfo &MRI = *MIB.getMRI(); // Attempt to optimize the test bit by walking over instructions. TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); LLT Ty = MRI.getType(TestReg); unsigned Size = Ty.getSizeInBits(); assert(!Ty.isVector() && "Expected a scalar!"); assert(Bit < 64 && "Bit is too large!"); // When the test register is a 64-bit register, we have to narrow to make // TBNZW work. bool UseWReg = Bit < 32; unsigned NecessarySize = UseWReg ? 32 : 64; if (Size != NecessarySize) TestReg = moveScalarRegClass( TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, MIB); static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, {AArch64::TBZW, AArch64::TBNZW}}; unsigned Opc = OpcTable[UseWReg][IsNegative]; auto TestBitMI = MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); return &*TestBitMI; } bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); // Given something like this: // // %x = ...Something... // %one = G_CONSTANT i64 1 // %zero = G_CONSTANT i64 0 // %and = G_AND %x, %one // %cmp = G_ICMP intpred(ne), %and, %zero // %cmp_trunc = G_TRUNC %cmp // G_BRCOND %cmp_trunc, %bb.3 // // We want to try and fold the AND into the G_BRCOND and produce either a // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 
// // In this case, we'd get // // TBNZ %x %bb.3 // // Check if the AND has a constant on its RHS which we can use as a mask. // If it's a power of 2, then it's the same as checking a specific bit. // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) auto MaybeBit = getIConstantVRegValWithLookThrough( AndInst.getOperand(2).getReg(), *MIB.getMRI()); if (!MaybeBit) return false; int32_t Bit = MaybeBit->Value.exactLogBase2(); if (Bit < 0) return false; Register TestReg = AndInst.getOperand(1).getReg(); // Emit a TB(N)Z. emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); return true; } MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, bool IsNegative, MachineBasicBlock *DestMBB, MachineIRBuilder &MIB) const { assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); MachineRegisterInfo &MRI = *MIB.getMRI(); assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == AArch64::GPRRegBankID && "Expected GPRs only?"); auto Ty = MRI.getType(CompareReg); unsigned Width = Ty.getSizeInBits(); assert(!Ty.isVector() && "Expected scalar only?"); assert(Width <= 64 && "Expected width to be at most 64?"); static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, {AArch64::CBNZW, AArch64::CBNZX}}; unsigned Opc = OpcTable[IsNegative][Width == 64]; auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); return &*BranchMI; } bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); assert(I.getOpcode() == TargetOpcode::G_BRCOND); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two branches to implement. 
auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, Pred); AArch64CC::CondCode CC1, CC2; changeFCMPPredToAArch64CC(static_cast(Pred), CC1, CC2); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); if (CC2 != AArch64CC::AL) MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); assert(I.getOpcode() == TargetOpcode::G_BRCOND); // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. // // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z // instructions will not be produced, as they are conditional branch // instructions that do not set flags. if (!ProduceNonFlagSettingCondBr) return false; MachineRegisterInfo &MRI = *MIB.getMRI(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); auto Pred = static_cast(ICmp.getOperand(1).getPredicate()); Register LHS = ICmp.getOperand(2).getReg(); Register RHS = ICmp.getOperand(3).getReg(); // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); // When we can emit a TB(N)Z, prefer that. // // Handle non-commutative condition codes first. // Note that we don't want to do this when we have a G_AND because it can // become a tst. The tst will make the test bit in the TB(N)Z redundant. if (VRegAndVal && !AndInst) { int64_t C = VRegAndVal->Value.getSExtValue(); // When we have a greater-than comparison, we can just test if the msb is // zero. 
if (C == -1 && Pred == CmpInst::ICMP_SGT) { uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); I.eraseFromParent(); return true; } // When we have a less than comparison, we can just test if the msb is not // zero. if (C == 0 && Pred == CmpInst::ICMP_SLT) { uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); I.eraseFromParent(); return true; } // Inversely, if we have a signed greater-than-or-equal comparison to zero, // we can test if the msb is zero. if (C == 0 && Pred == CmpInst::ICMP_SGE) { uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); I.eraseFromParent(); return true; } } // Attempt to handle commutative condition codes. Right now, that's only // eq/ne. if (ICmpInst::isEquality(Pred)) { if (!VRegAndVal) { std::swap(RHS, LHS); VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); } if (VRegAndVal && VRegAndVal->Value == 0) { // If there's a G_AND feeding into this branch, try to fold it away by // emitting a TB(N)Z instead. // // Note: If we have LT, then it *is* possible to fold, but it wouldn't be // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding // would be redundant. if (AndInst && tryOptAndIntoCompareBranch( *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { I.eraseFromParent(); return true; } // Otherwise, try to emit a CB(N)Z instead. 
auto LHSTy = MRI.getType(LHS); if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); I.eraseFromParent(); return true; } } } return false; } bool AArch64InstructionSelector::selectCompareBranchFedByICmp( MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); assert(I.getOpcode() == TargetOpcode::G_BRCOND); if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) return true; // Couldn't optimize. Emit a compare + a Bcc. MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); auto PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( static_cast(PredOp.getPredicate())); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { Register CondReg = I.getOperand(0).getReg(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); // Try to select the G_BRCOND using whatever is feeding the condition if // possible. unsigned CCMIOpc = CCMI->getOpcode(); if (CCMIOpc == TargetOpcode::G_FCMP) return selectCompareBranchFedByFCmp(I, *CCMI, MIB); if (CCMIOpc == TargetOpcode::G_ICMP) return selectCompareBranchFedByICmp(I, *CCMI, MIB); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z // instructions will not be produced, as they are conditional branch // instructions that do not set flags. if (ProduceNonFlagSettingCondBr) { emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, I.getOperand(1).getMBB(), MIB); I.eraseFromParent(); return true; } // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 
auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); auto Bcc = MIB.buildInstr(AArch64::Bcc) .addImm(AArch64CC::NE) .addMBB(I.getOperand(1).getMBB()); I.eraseFromParent(); return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); } /// Returns the element immediate value of a vector shift operand if found. /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. static std::optional getVectorShiftImm(Register Reg, MachineRegisterInfo &MRI) { assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); MachineInstr *OpMI = MRI.getVRegDef(Reg); return getAArch64VectorSplatScalar(*OpMI, MRI); } /// Matches and returns the shift immediate value for a SHL instruction given /// a shift operand. static std::optional getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { std::optional ShiftImm = getVectorShiftImm(Reg, MRI); if (!ShiftImm) return std::nullopt; // Check the immediate is in range for a SHL. int64_t Imm = *ShiftImm; if (Imm < 0) return std::nullopt; switch (SrcTy.getElementType().getSizeInBits()) { default: LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); return std::nullopt; case 8: if (Imm > 7) return std::nullopt; break; case 16: if (Imm > 15) return std::nullopt; break; case 32: if (Imm > 31) return std::nullopt; break; case 64: if (Imm > 63) return std::nullopt; break; } return Imm; } bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_SHL); Register DstReg = I.getOperand(0).getReg(); const LLT Ty = MRI.getType(DstReg); Register Src1Reg = I.getOperand(1).getReg(); Register Src2Reg = I.getOperand(2).getReg(); if (!Ty.isVector()) return false; // Check if we have a vector of constants on RHS that we can select as the // immediate form. 
std::optional ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); unsigned Opc = 0; if (Ty == LLT::fixed_vector(2, 64)) { Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; } else if (Ty == LLT::fixed_vector(4, 32)) { Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; } else if (Ty == LLT::fixed_vector(2, 32)) { Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; } else if (Ty == LLT::fixed_vector(4, 16)) { Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16; } else if (Ty == LLT::fixed_vector(8, 16)) { Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; } else if (Ty == LLT::fixed_vector(16, 8)) { Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; } else if (Ty == LLT::fixed_vector(8, 8)) { Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; } else { LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); return false; } auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); if (ImmVal) Shl.addImm(*ImmVal); else Shl.addUse(Src2Reg); constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectVectorAshrLshr( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_ASHR || I.getOpcode() == TargetOpcode::G_LSHR); Register DstReg = I.getOperand(0).getReg(); const LLT Ty = MRI.getType(DstReg); Register Src1Reg = I.getOperand(1).getReg(); Register Src2Reg = I.getOperand(2).getReg(); if (!Ty.isVector()) return false; bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; // We expect the immediate case to be lowered in the PostLegalCombiner to // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. // There is not a shift right register instruction, but the shift left // register instruction takes a signed value, where negative numbers specify a // right shift. 
unsigned Opc = 0; unsigned NegOpc = 0; const TargetRegisterClass *RC = getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); if (Ty == LLT::fixed_vector(2, 64)) { Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; NegOpc = AArch64::NEGv2i64; } else if (Ty == LLT::fixed_vector(4, 32)) { Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; NegOpc = AArch64::NEGv4i32; } else if (Ty == LLT::fixed_vector(2, 32)) { Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; NegOpc = AArch64::NEGv2i32; } else if (Ty == LLT::fixed_vector(4, 16)) { Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; NegOpc = AArch64::NEGv4i16; } else if (Ty == LLT::fixed_vector(8, 16)) { Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; NegOpc = AArch64::NEGv8i16; } else if (Ty == LLT::fixed_vector(16, 8)) { Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; NegOpc = AArch64::NEGv16i8; } else if (Ty == LLT::fixed_vector(8, 8)) { Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; NegOpc = AArch64::NEGv8i8; } else { LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); return false; } auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectVaStartAAPCS( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { return false; } bool AArch64InstructionSelector::selectVaStartDarwin( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { AArch64FunctionInfo *FuncInfo = MF.getInfo(); Register ListReg = I.getOperand(0).getReg(); Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); int FrameIdx = FuncInfo->getVarArgsStackIndex(); if (MF.getSubtarget().isCallingConvWin64( MF.getFunction().getCallingConv())) { FrameIdx = FuncInfo->getVarArgsGPRSize() > 0 ? 
FuncInfo->getVarArgsGPRIndex() : FuncInfo->getVarArgsStackIndex(); } auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) .addDef(ArgsAddrReg) .addFrameIndex(FrameIdx) .addImm(0) .addImm(0); constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) .addUse(ArgsAddrReg) .addUse(ListReg) .addImm(0) .addMemOperand(*I.memoperands_begin()); constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); I.eraseFromParent(); return true; } void AArch64InstructionSelector::materializeLargeCMVal( MachineInstr &I, const Value *V, unsigned OpFlags) { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); MovZ->addOperand(MF, I.getOperand(1)); MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | AArch64II::MO_NC); MovZ->addOperand(MF, MachineOperand::CreateImm(0)); constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, Register ForceDstReg) { Register DstReg = ForceDstReg ? 
ForceDstReg : MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); if (auto *GV = dyn_cast(V)) { MovI->addOperand(MF, MachineOperand::CreateGA( GV, MovZ->getOperand(1).getOffset(), Flags)); } else { MovI->addOperand( MF, MachineOperand::CreateBA(cast(V), MovZ->getOperand(1).getOffset(), Flags)); } MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); return DstReg; }; Register DstReg = BuildMovK(MovZ.getReg(0), AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); } bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { case TargetOpcode::G_STORE: { bool Changed = contractCrossBankCopyIntoStore(I, MRI); MachineOperand &SrcOp = I.getOperand(0); if (MRI.getType(SrcOp.getReg()).isPointer()) { // Allow matching with imported patterns for stores of pointers. Unlike // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy // and constrain. auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); Register NewSrc = Copy.getReg(0); SrcOp.setReg(NewSrc); RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); Changed = true; } return Changed; } case TargetOpcode::G_PTR_ADD: return convertPtrAddToAdd(I, MRI); case TargetOpcode::G_LOAD: { // For scalar loads of pointers, we try to convert the dest type from p0 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD // conversion, this should be ok because all users should have been // selected already, so the type doesn't matter for them. 
Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); if (!DstTy.isPointer()) return false; MRI.setType(DstReg, LLT::scalar(64)); return true; } case AArch64::G_DUP: { // Convert the type from p0 to s64 to help selection. LLT DstTy = MRI.getType(I.getOperand(0).getReg()); if (!DstTy.getElementType().isPointer()) return false; auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); MRI.setType(I.getOperand(0).getReg(), DstTy.changeElementType(LLT::scalar(64))); MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); I.getOperand(1).setReg(NewSrc.getReg(0)); return true; } case TargetOpcode::G_UITOFP: case TargetOpcode::G_SITOFP: { // If both source and destination regbanks are FPR, then convert the opcode // to G_SITOF so that the importer can select it to an fpr variant. // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank // copy. Register SrcReg = I.getOperand(1).getReg(); LLT SrcTy = MRI.getType(SrcReg); LLT DstTy = MRI.getType(I.getOperand(0).getReg()); if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) return false; if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { if (I.getOpcode() == TargetOpcode::G_SITOFP) I.setDesc(TII.get(AArch64::G_SITOF)); else I.setDesc(TII.get(AArch64::G_UITOF)); return true; } return false; } default: return false; } } /// This lowering tries to look for G_PTR_ADD instructions and then converts /// them to a standard G_ADD with a COPY on the source. /// /// The motivation behind this is to expose the add semantics to the imported /// tablegen patterns. We shouldn't need to check for uses being loads/stores, /// because the selector works bottom up, uses before defs. By the time we /// end up trying to select a G_PTR_ADD, we should have already attempted to /// fold this into addressing modes and were therefore unsuccessful. 
bool AArch64InstructionSelector::convertPtrAddToAdd( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); Register DstReg = I.getOperand(0).getReg(); Register AddOp1Reg = I.getOperand(1).getReg(); const LLT PtrTy = MRI.getType(DstReg); if (PtrTy.getAddressSpace() != 0) return false; const LLT CastPtrTy = PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); // Set regbanks on the registers. if (PtrTy.isVector()) MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); else MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); // Now turn the %dst(p0) = G_PTR_ADD %base, off into: // %dst(intty) = G_ADD %intbase, off I.setDesc(TII.get(TargetOpcode::G_ADD)); MRI.setType(DstReg, CastPtrTy); I.getOperand(1).setReg(PtrToInt.getReg(0)); if (!select(*PtrToInt)) { LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); return false; } // Also take the opportunity here to try to do some optimization. // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. Register NegatedReg; if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) return true; I.getOperand(2).setReg(NegatedReg); I.setDesc(TII.get(TargetOpcode::G_SUB)); return true; } bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) { // We try to match the immediate variant of LSL, which is actually an alias // for a special case of UBFM. Otherwise, we fall back to the imported // selector which will match the register variant. 
assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); const auto &MO = I.getOperand(2); auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); if (!VRegAndVal) return false; const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); if (DstTy.isVector()) return false; bool Is64Bit = DstTy.getSizeInBits() == 64; auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); if (!Imm1Fn || !Imm2Fn) return false; auto NewI = MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); for (auto &RenderFn : *Imm1Fn) RenderFn(NewI); for (auto &RenderFn : *Imm2Fn) RenderFn(NewI); I.eraseFromParent(); return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); } bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); // If we're storing a scalar, it doesn't matter what register bank that // scalar is on. All that matters is the size. // // So, if we see something like this (with a 32-bit scalar as an example): // // %x:gpr(s32) = ... something ... // %y:fpr(s32) = COPY %x:gpr(s32) // G_STORE %y:fpr(s32) // // We can fix this up into something like this: // // G_STORE %x:gpr(s32) // // And then continue the selection process normally. Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); if (!DefDstReg.isValid()) return false; LLT DefDstTy = MRI.getType(DefDstReg); Register StoreSrcReg = I.getOperand(0).getReg(); LLT StoreSrcTy = MRI.getType(StoreSrcReg); // If we get something strange like a physical register, then we shouldn't // go any further. if (!DefDstTy.isValid()) return false; // Are the source and dst types the same size? 
if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) return false; if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == RBI.getRegBank(DefDstReg, MRI, TRI)) return false; // We have a cross-bank copy, which is entering a store. Let's fold it. I.getOperand(0).setReg(DefDstReg); return true; } bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { case AArch64::G_DUP: { // Before selecting a DUP instruction, check if it is better selected as a // MOV or load from a constant pool. Register Src = I.getOperand(1).getReg(); auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI); if (!ValAndVReg) return false; LLVMContext &Ctx = MF.getFunction().getContext(); Register Dst = I.getOperand(0).getReg(); auto *CV = ConstantDataVector::getSplat( MRI.getType(Dst).getNumElements(), ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), ValAndVReg->Value)); if (!emitConstantVector(Dst, CV, MIB, MRI)) return false; I.eraseFromParent(); return true; } case TargetOpcode::G_SEXT: // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV // over a normal extend. 
if (selectUSMovFromExtend(I, MRI)) return true; return false; case TargetOpcode::G_BR: return false; case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); case TargetOpcode::G_CONSTANT: { bool IsZero = false; if (I.getOperand(1).isCImm()) IsZero = I.getOperand(1).getCImm()->isZero(); else if (I.getOperand(1).isImm()) IsZero = I.getOperand(1).getImm() == 0; if (!IsZero) return false; Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); if (Ty.getSizeInBits() == 64) { I.getOperand(1).ChangeToRegister(AArch64::XZR, false); RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); } else if (Ty.getSizeInBits() == 32) { I.getOperand(1).ChangeToRegister(AArch64::WZR, false); RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); } else return false; I.setDesc(TII.get(TargetOpcode::COPY)); return true; } case TargetOpcode::G_ADD: { // Check if this is being fed by a G_ICMP on either side. // // (cmp pred, x, y) + z // // In the above case, when the cmp is true, we increment z by 1. So, we can // fold the add into the cset for the cmp by using cinc. // // FIXME: This would probably be a lot nicer in PostLegalizerLowering. Register AddDst = I.getOperand(0).getReg(); Register AddLHS = I.getOperand(1).getReg(); Register AddRHS = I.getOperand(2).getReg(); // Only handle scalars. LLT Ty = MRI.getType(AddLHS); if (Ty.isVector()) return false; // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 // bits. unsigned Size = Ty.getSizeInBits(); if (Size != 32 && Size != 64) return false; auto MatchCmp = [&](Register Reg) -> MachineInstr * { if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; // If the LHS of the add is 32 bits, then we want to fold a 32-bit // compare. if (Size == 32) return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); // We model scalar compares using 32-bit destinations right now. // If it's a 64-bit compare, it'll have 64-bit sources. 
Register ZExt; if (!mi_match(Reg, MRI, m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) return nullptr; auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); if (!Cmp || MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) return nullptr; return Cmp; }; // Try to match // z + (cmp pred, x, y) MachineInstr *Cmp = MatchCmp(AddRHS); if (!Cmp) { // (cmp pred, x, y) + z std::swap(AddLHS, AddRHS); Cmp = MatchCmp(AddRHS); if (!Cmp) return false; } auto &PredOp = Cmp->getOperand(1); auto Pred = static_cast(PredOp.getPredicate()); const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); MIB.setInstrAndDebugLoc(I); emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; } case TargetOpcode::G_OR: { // Look for operations that take the lower `Width=Size-ShiftImm` bits of // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via // shifting and masking that we can replace with a BFI (encoded as a BFM). Register Dst = I.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); if (!Ty.isScalar()) return false; unsigned Size = Ty.getSizeInBits(); if (Size != 32 && Size != 64) return false; Register ShiftSrc; int64_t ShiftImm; Register MaskSrc; int64_t MaskImm; if (!mi_match( Dst, MRI, m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) return false; if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) return false; int64_t Immr = Size - ShiftImm; int64_t Imms = Size - ShiftImm - 1; unsigned Opc = Size == 32 ? 
AArch64::BFMWri : AArch64::BFMXri; emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); I.eraseFromParent(); return true; } case TargetOpcode::G_FENCE: { if (I.getOperand(1).getImm() == 0) BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER)); else BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB)) .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); I.eraseFromParent(); return true; } default: return false; } } bool AArch64InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); const AArch64Subtarget *Subtarget = &MF.getSubtarget(); if (Subtarget->requiresStrictAlign()) { // We don't support this feature yet. LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); return false; } MIB.setInstrAndDebugLoc(I); unsigned Opcode = I.getOpcode(); // G_PHI requires same handling as PHI if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { // Certain non-generic instructions also need some special handling. 
if (Opcode == TargetOpcode::LOAD_STACK_GUARD) return constrainSelectedInstRegOperands(I, TII, TRI, RBI); if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(DefReg); const TargetRegisterClass *DefRC = RegClassOrBank.dyn_cast(); if (!DefRC) { if (!DefTy.isValid()) { LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); return false; } const RegisterBank &RB = *RegClassOrBank.get(); DefRC = getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; } } I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); } if (I.isCopy()) return selectCopy(I, TII, MRI, TRI, RBI); if (I.isDebugInstr()) return selectDebugInstr(I, MRI, RBI); return true; } if (I.getNumOperands() != I.getNumExplicitOperands()) { LLVM_DEBUG( dbgs() << "Generic instruction has unexpected implicit operands\n"); return false; } // Try to do some lowering before we start instruction selecting. These // lowerings are purely transformations on the input G_MIR and so selection // must continue after any modification of the instruction. if (preISelLower(I)) { Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. } // There may be patterns where the importer can't deal with them optimally, // but does select it to a suboptimal sequence so our custom C++ selection // code later never has a chance to work on it. Therefore, we have an early // selection attempt here to give priority to certain selection routines // over the imported ones. if (earlySelect(I)) return true; if (selectImpl(I, *CoverageInfo)) return true; LLT Ty = I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{}; switch (Opcode) { case TargetOpcode::G_SBFX: case TargetOpcode::G_UBFX: { static const unsigned OpcTable[2][2] = { {AArch64::UBFMWri, AArch64::UBFMXri}, {AArch64::SBFMWri, AArch64::SBFMXri}}; bool IsSigned = Opcode == TargetOpcode::G_SBFX; unsigned Size = Ty.getSizeInBits(); unsigned Opc = OpcTable[IsSigned][Size == 64]; auto Cst1 = getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); assert(Cst1 && "Should have gotten a constant for src 1?"); auto Cst2 = getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); assert(Cst2 && "Should have gotten a constant for src 2?"); auto LSB = Cst1->Value.getZExtValue(); auto Width = Cst2->Value.getZExtValue(); auto BitfieldInst = MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) .addImm(LSB) .addImm(LSB + Width - 1); I.eraseFromParent(); return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); } case TargetOpcode::G_BRCOND: return selectCompareBranch(I, MF, MRI); case TargetOpcode::G_BRINDIRECT: { I.setDesc(TII.get(AArch64::BR)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_BRJT: return selectBrJT(I, MRI); case AArch64::G_ADD_LOW: { // This op may have been separated from it's ADRP companion by the localizer // or some other code motion pass. Given that many CPUs will try to // macro fuse these operations anyway, select this into a MOVaddr pseudo // which will later be expanded into an ADRP+ADD pair after scheduling. 
MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); if (BaseMI->getOpcode() != AArch64::ADRP) { I.setDesc(TII.get(AArch64::ADDXri)); I.addOperand(MachineOperand::CreateImm(0)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } assert(TM.getCodeModel() == CodeModel::Small && "Expected small code model"); auto Op1 = BaseMI->getOperand(1); auto Op2 = I.getOperand(2); auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), Op1.getTargetFlags()) .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), Op2.getTargetFlags()); I.eraseFromParent(); return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); } case TargetOpcode::G_BSWAP: { // Handle vector types for G_BSWAP directly. Register DstReg = I.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); // We should only get vector types here; everything else is handled by the // importer right now. if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) { LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n"); return false; } // Only handle 4 and 2 element vectors for now. // TODO: 16-bit elements. unsigned NumElts = DstTy.getNumElements(); if (NumElts != 4 && NumElts != 2) { LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n"); return false; } // Choose the correct opcode for the supported types. Right now, that's // v2s32, v4s32, and v2s64. unsigned Opc = 0; unsigned EltSize = DstTy.getElementType().getSizeInBits(); if (EltSize == 32) Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8 : AArch64::REV32v16i8; else if (EltSize == 64) Opc = AArch64::REV64v16i8; // We should always get something by the time we get here... 
assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?"); I.setDesc(TII.get(Opc)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_CONSTANT: { const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; const LLT s8 = LLT::scalar(8); const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); const LLT s128 = LLT::scalar(128); const LLT p0 = LLT::pointer(0, 64); const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); const unsigned DefSize = DefTy.getSizeInBits(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); // FIXME: Redundant check, but even less readable when factored out. if (isFP) { if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty << " constant, expected: " << s16 << " or " << s32 << " or " << s64 << " or " << s128 << '\n'); return false; } if (RB.getID() != AArch64::FPRRegBankID) { LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty << " constant on bank: " << RB << ", expected: FPR\n"); return false; } // The case when we have 0.0 is covered by tablegen. Reject it here so we // can be sure tablegen works correctly and isn't rescued by this code. // 0.0 is not covered by tablegen for FP128. So we will handle this // scenario in the code here. if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) return false; } else { // s32 and s64 are covered by tablegen. 
if (Ty != p0 && Ty != s8 && Ty != s16) { LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant, expected: " << s32 << ", " << s64 << ", or " << p0 << '\n'); return false; } if (RB.getID() != AArch64::GPRRegBankID) { LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant on bank: " << RB << ", expected: GPR\n"); return false; } } if (isFP) { const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); // For 16, 64, and 128b values, emit a constant pool load. switch (DefSize) { default: llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); case 32: // For s32, use a cp load if we have optsize/minsize. if (!shouldOptForSize(&MF)) break; [[fallthrough]]; case 16: case 64: case 128: { auto *FPImm = I.getOperand(1).getFPImm(); auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); if (!LoadMI) { LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); return false; } MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); I.eraseFromParent(); return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); } } // Either emit a FMOV, or emit a copy to emit a normal mov. assert(DefSize == 32 && "Expected constant pool loads for all sizes other than 32!"); const Register DefGPRReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineOperand &RegOp = I.getOperand(0); RegOp.setReg(DefGPRReg); MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); MIB.buildCopy({DefReg}, {DefGPRReg}); if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); return false; } MachineOperand &ImmOp = I.getOperand(1); // FIXME: Is going through int64_t always correct? 
ImmOp.ChangeToImmediate( ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); } else if (I.getOperand(1).isCImm()) { uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); I.getOperand(1).ChangeToImmediate(Val); } else if (I.getOperand(1).isImm()) { uint64_t Val = I.getOperand(1).getImm(); I.getOperand(1).ChangeToImmediate(Val); } const unsigned MovOpc = DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; I.setDesc(TII.get(MovOpc)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } case TargetOpcode::G_EXTRACT: { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); LLT SrcTy = MRI.getType(SrcReg); LLT DstTy = MRI.getType(DstReg); (void)DstTy; unsigned SrcSize = SrcTy.getSizeInBits(); if (SrcTy.getSizeInBits() > 64) { // This should be an extract of an s128, which is like a vector extract. if (SrcTy.getSizeInBits() != 128) return false; // Only support extracting 64 bits from an s128 at the moment. if (DstTy.getSizeInBits() != 64) return false; unsigned Offset = I.getOperand(2).getImm(); if (Offset % 64 != 0) return false; // Check we have the right regbank always. const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); if (SrcRB.getID() == AArch64::GPRRegBankID) { auto NewI = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64); constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI, AArch64::GPR64RegClass, NewI->getOperand(0)); I.eraseFromParent(); return true; } // Emit the same code as a vector extract. // Offset must be a multiple of 64. unsigned LaneIdx = Offset / 64; MachineInstr *Extract = emitExtractVectorElt( DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); if (!Extract) return false; I.eraseFromParent(); return true; } I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + Ty.getSizeInBits() - 1); if (SrcSize < 64) { assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && "unexpected G_EXTRACT types"); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) .addReg(DstReg, 0, AArch64::sub_32); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR32RegClass, MRI); I.getOperand(0).setReg(DstReg); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_INSERT: { LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); LLT DstTy = MRI.getType(I.getOperand(0).getReg()); unsigned DstSize = DstTy.getSizeInBits(); // Larger inserts are vectors, same-size ones should be something else by // now (split up or turned into COPYs). if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) return false; I.setDesc(TII.get(DstSize == 64 ? 
AArch64::BFMXri : AArch64::BFMWri)); unsigned LSB = I.getOperand(3).getImm(); unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); I.getOperand(3).setImm((DstSize - LSB) % DstSize); MachineInstrBuilder(MF, I).addImm(Width - 1); if (DstSize < 64) { assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && "unexpected G_INSERT types"); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, I.getIterator(), I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(SrcReg) .addImm(0) .addUse(I.getOperand(2).getReg()) .addImm(AArch64::sub_32); RBI.constrainGenericRegister(I.getOperand(2).getReg(), AArch64::GPR32RegClass, MRI); I.getOperand(2).setReg(SrcReg); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_FRAME_INDEX: { // allocas and G_FRAME_INDEX are only supported in addrspace(0). if (Ty != LLT::pointer(0, 64)) { LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } I.setDesc(TII.get(AArch64::ADDXri)); // MOs for a #0 shifted immediate. I.addOperand(MachineOperand::CreateImm(0)); I.addOperand(MachineOperand::CreateImm(0)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_GLOBAL_VALUE: { auto GV = I.getOperand(1).getGlobal(); if (GV->isThreadLocal()) return selectTLSGlobalValue(I, MRI); unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); if (OpFlags & AArch64II::MO_GOT) { I.setDesc(TII.get(AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); } else if (TM.getCodeModel() == CodeModel::Large) { // Materialize the global using movz/movk instructions. 
materializeLargeCMVal(I, GV, OpFlags); I.eraseFromParent(); return true; } else if (TM.getCodeModel() == CodeModel::Tiny) { I.setDesc(TII.get(AArch64::ADR)); I.getOperand(1).setTargetFlags(OpFlags); } else { I.setDesc(TII.get(AArch64::MOVaddr)); I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); MachineInstrBuilder MIB(MF, I); MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_ZEXTLOAD: case TargetOpcode::G_LOAD: case TargetOpcode::G_STORE: { GLoadStore &LdSt = cast(I); bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; LLT PtrTy = MRI.getType(LdSt.getPointerReg()); if (PtrTy != LLT::pointer(0, 64)) { LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } uint64_t MemSizeInBytes = LdSt.getMemSize(); unsigned MemSizeInBits = LdSt.getMemSizeInBits(); AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); // Need special instructions for atomics that affect ordering. if (Order != AtomicOrdering::NotAtomic && Order != AtomicOrdering::Unordered && Order != AtomicOrdering::Monotonic) { assert(!isa(LdSt)); if (MemSizeInBytes > 64) return false; if (isa(LdSt)) { static constexpr unsigned LDAPROpcodes[] = { AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; static constexpr unsigned LDAROpcodes[] = { AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; ArrayRef Opcodes = STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent ? LDAPROpcodes : LDAROpcodes; I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); } else { static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, AArch64::STLRW, AArch64::STLRX}; Register ValReg = LdSt.getReg(0); if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { // Emit a subreg copy of 32 bits. 
Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); I.getOperand(0).setReg(NewVal); } I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); } constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } #ifndef NDEBUG const Register PtrReg = LdSt.getPointerReg(); const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); // Check that the pointer register is valid. assert(PtrRB.getID() == AArch64::GPRRegBankID && "Load/Store pointer operand isn't a GPR"); assert(MRI.getType(PtrReg).isPointer() && "Load/Store pointer operand isn't a pointer"); #endif const Register ValReg = LdSt.getReg(0); const LLT ValTy = MRI.getType(ValReg); const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); // The code below doesn't support truncating stores, so we need to split it // again. if (isa(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { unsigned SubReg; LLT MemTy = LdSt.getMMO().getMemoryType(); auto *RC = getRegClassForTypeOnBank(MemTy, RB); if (!getSubRegForClass(RC, TRI, SubReg)) return false; // Generate a subreg copy. auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) .addReg(ValReg, 0, SubReg) .getReg(0); RBI.constrainGenericRegister(Copy, *RC, MRI); LdSt.getOperand(0).setReg(Copy); } else if (isa(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { // If this is an any-extending load from the FPR bank, split it into a regular // load + extend. if (RB.getID() == AArch64::FPRRegBankID) { unsigned SubReg; LLT MemTy = LdSt.getMMO().getMemoryType(); auto *RC = getRegClassForTypeOnBank(MemTy, RB); if (!getSubRegForClass(RC, TRI, SubReg)) return false; Register OldDst = LdSt.getReg(0); Register NewDst = MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); LdSt.getOperand(0).setReg(NewDst); MRI.setRegBank(NewDst, RB); // Generate a SUBREG_TO_REG to extend it. 
MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) .addImm(0) .addUse(NewDst) .addImm(SubReg); auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); MIB.setInstr(LdSt); } } // Helper lambda for partially selecting I. Either returns the original // instruction with an updated opcode, or a new instruction. auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { bool IsStore = isa(I); const unsigned NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); if (NewOpc == I.getOpcode()) return nullptr; // Check if we can fold anything into the addressing mode. auto AddrModeFns = selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); if (!AddrModeFns) { // Can't fold anything. Use the original instruction. I.setDesc(TII.get(NewOpc)); I.addOperand(MachineOperand::CreateImm(0)); return &I; } // Folded something. Create a new instruction and return it. auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); Register CurValReg = I.getOperand(0).getReg(); IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); NewInst.cloneMemRefs(I); for (auto &Fn : *AddrModeFns) Fn(NewInst); I.eraseFromParent(); return &*NewInst; }; MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); if (!LoadStore) return false; // If we're storing a 0, use WZR/XZR. if (Opcode == TargetOpcode::G_STORE) { auto CVal = getIConstantVRegValWithLookThrough( LoadStore->getOperand(0).getReg(), MRI); if (CVal && CVal->Value == 0) { switch (LoadStore->getOpcode()) { case AArch64::STRWui: case AArch64::STRHHui: case AArch64::STRBBui: LoadStore->getOperand(0).setReg(AArch64::WZR); break; case AArch64::STRXui: LoadStore->getOperand(0).setReg(AArch64::XZR); break; } } } if (IsZExtLoad) { // The zextload from a smaller type to i32 should be handled by the // importer. 
if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) return false; // If we have a ZEXTLOAD then change the load's type to be a narrower reg // and zero_extend with SUBREG_TO_REG. Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); Register DstReg = LoadStore->getOperand(0).getReg(); LoadStore->getOperand(0).setReg(LdReg); MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) .addImm(0) .addUse(LdReg) .addImm(AArch64::sub_32); constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, MRI); } return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); } case TargetOpcode::G_SMULH: case TargetOpcode::G_UMULH: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; const Register DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); return false; } if (Ty != LLT::scalar(64)) { LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty << ", expected: " << LLT::scalar(64) << '\n'); return false; } unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr : AArch64::UMULHrr; I.setDesc(TII.get(NewOpc)); // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. 
return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: if (MRI.getType(I.getOperand(0).getReg()).isVector()) return selectVectorAshrLshr(I, MRI); [[fallthrough]]; case TargetOpcode::G_SHL: if (Opcode == TargetOpcode::G_SHL && MRI.getType(I.getOperand(0).getReg()).isVector()) return selectVectorSHL(I, MRI); // These shifts were legalized to have 64 bit shift amounts because we // want to take advantage of the selection patterns that assume the // immediates are s64s, however, selectBinaryOp will assume both operands // will have the same bit size. { Register SrcReg = I.getOperand(1).getReg(); Register ShiftReg = I.getOperand(2).getReg(); const LLT ShiftTy = MRI.getType(ShiftReg); const LLT SrcTy = MRI.getType(SrcReg); if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && ShiftTy.getSizeInBits() == 64) { assert(!ShiftTy.isVector() && "unexpected vector shift ty"); // Insert a subregister copy to implement a 64->32 trunc auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) .addReg(ShiftReg, 0, AArch64::sub_32); MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); I.getOperand(2).setReg(Trunc.getReg(0)); } } [[fallthrough]]; case TargetOpcode::G_OR: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; const unsigned OpSize = Ty.getSizeInBits(); const Register DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); if (NewOpc == I.getOpcode()) return false; I.setDesc(TII.get(NewOpc)); // FIXME: Should the type be always reset in setDesc? // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. 
return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_PTR_ADD: { emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); I.eraseFromParent(); return true; } case TargetOpcode::G_SADDE: case TargetOpcode::G_UADDE: case TargetOpcode::G_SSUBE: case TargetOpcode::G_USUBE: case TargetOpcode::G_SADDO: case TargetOpcode::G_UADDO: case TargetOpcode::G_SSUBO: case TargetOpcode::G_USUBO: return selectOverflowOp(I, MRI); case TargetOpcode::G_PTRMASK: { Register MaskReg = I.getOperand(2).getReg(); std::optional MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); // TODO: Implement arbitrary cases if (!MaskVal || !isShiftedMask_64(*MaskVal)) return false; uint64_t Mask = *MaskVal; I.setDesc(TII.get(AArch64::ANDXri)); I.getOperand(2).ChangeToImmediate( AArch64_AM::encodeLogicalImmediate(Mask, 64)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_PTRTOINT: case TargetOpcode::G_TRUNC: { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); const Register DstReg = I.getOperand(0).getReg(); const Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); if (DstRB.getID() != SrcRB.getID()) { LLVM_DEBUG( dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); return false; } if (DstRB.getID() == AArch64::GPRRegBankID) { const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); if (!DstRC) return false; const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); if (!SrcRC) return false; if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); return false; } if (DstRC == SrcRC) { // Nothing to be done } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && SrcTy 
== LLT::scalar(64)) { llvm_unreachable("TableGen can import this case"); return false; } else if (DstRC == &AArch64::GPR32RegClass && SrcRC == &AArch64::GPR64RegClass) { I.getOperand(1).setSubReg(AArch64::sub_32); } else { LLVM_DEBUG( dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); return false; } I.setDesc(TII.get(TargetOpcode::COPY)); return true; } else if (DstRB.getID() == AArch64::FPRRegBankID) { if (DstTy == LLT::fixed_vector(4, 16) && SrcTy == LLT::fixed_vector(4, 32)) { I.setDesc(TII.get(AArch64::XTNv4i16)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { MachineInstr *Extract = emitExtractVectorElt( DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); if (!Extract) return false; I.eraseFromParent(); return true; } // We might have a vector G_PTRTOINT, in which case just emit a COPY. if (Opcode == TargetOpcode::G_PTRTOINT) { assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); I.setDesc(TII.get(TargetOpcode::COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } } return false; } case TargetOpcode::G_ANYEXT: { if (selectUSMovFromExtend(I, MRI)) return true; const Register DstReg = I.getOperand(0).getReg(); const Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); if (RBDst.getID() != AArch64::GPRRegBankID) { LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n"); return false; } const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); if (RBSrc.getID() != AArch64::GPRRegBankID) { LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n"); return false; } const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); if (DstSize == 0) { LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); return false; } if (DstSize != 64 && DstSize > 32) { LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize << ", expected: 32 or 64\n"); 
return false; } // At this point G_ANYEXT is just like a plain COPY, but we need // to explicitly form the 64-bit value if any. if (DstSize > 32) { Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(ExtSrc) .addImm(0) .addUse(SrcReg) .addImm(AArch64::sub_32); I.getOperand(1).setReg(ExtSrc); } return selectCopy(I, TII, MRI, TRI, RBI); } case TargetOpcode::G_ZEXT: case TargetOpcode::G_SEXT_INREG: case TargetOpcode::G_SEXT: { if (selectUSMovFromExtend(I, MRI)) return true; unsigned Opcode = I.getOpcode(); const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; const Register DefReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DefReg); const LLT SrcTy = MRI.getType(SrcReg); unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); // SEXT_INREG has the same src reg size as dst, the size of the value to be // extended is encoded in the imm. if (Opcode == TargetOpcode::G_SEXT_INREG) SrcSize = I.getOperand(2).getImm(); if (DstTy.isVector()) return false; // Should be handled by imported patterns. assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == AArch64::GPRRegBankID && "Unexpected ext regbank"); MachineInstr *ExtI; // First check if we're extending the result of a load which has a dest type // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest // GPR register on AArch64 and all loads which are smaller automatically // zero-extend the upper bits. E.g. 
// %v(s8) = G_LOAD %p, :: (load 1) // %v2(s32) = G_ZEXT %v(s8) if (!IsSigned) { auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); bool IsGPR = RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; if (LoadMI && IsGPR) { const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); unsigned BytesLoaded = MemOp->getSize(); if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) return selectCopy(I, TII, MRI, TRI, RBI); } // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) // + SUBREG_TO_REG. if (IsGPR && SrcSize == 32 && DstSize == 64) { Register SubregToRegSrc = MRI.createVirtualRegister(&AArch64::GPR32RegClass); const Register ZReg = AArch64::WZR; MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg}) .addImm(0); MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) .addImm(0) .addUse(SubregToRegSrc) .addImm(AArch64::sub_32); if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); return false; } if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); return false; } I.eraseFromParent(); return true; } } if (DstSize == 64) { if (Opcode != TargetOpcode::G_SEXT_INREG) { // FIXME: Can we avoid manually doing this? if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) << " operand\n"); return false; } SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) .addImm(0) .addUse(SrcReg) .addImm(AArch64::sub_32) .getReg(0); } ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, {DefReg}, {SrcReg}) .addImm(0) .addImm(SrcSize - 1); } else if (DstSize <= 32) { ExtI = MIB.buildInstr(IsSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri, {DefReg}, {SrcReg}) .addImm(0) .addImm(SrcSize - 1); } else { return false; } constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); I.eraseFromParent(); return true; } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), SrcTy = MRI.getType(I.getOperand(1).getReg()); const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); if (NewOpc == Opcode) return false; I.setDesc(TII.get(NewOpc)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); I.setFlags(MachineInstr::NoFPExcept); return true; } case TargetOpcode::G_FREEZE: return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_INTTOPTR: // The importer is currently unable to import pointer types since they // didn't exist in SelectionDAG. return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_BITCAST: // Imported SelectionDAG rules can handle every bitcast except those that // bitcast from a type to the same type. Ideally, these shouldn't occur // but we might not run an optimizer that deletes them. The other exception // is bitcasts involving pointer types, as SelectionDAG has no knowledge // of them. return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { auto &Sel = cast(I); const Register CondReg = Sel.getCondReg(); const Register TReg = Sel.getTrueReg(); const Register FReg = Sel.getFalseReg(); if (tryOptSelect(Sel)) return true; // Make sure to use an unused vreg instead of wzr, so that the peephole // optimizations will be able to optimize these. 
Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) return false; Sel.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { if (Ty.isVector()) return selectVectorICmp(I, MRI); if (Ty != LLT::scalar(32)) { LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty << ", expected: " << LLT::scalar(32) << '\n'); return false; } auto Pred = static_cast(I.getOperand(1).getPredicate()); const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); return true; } case TargetOpcode::G_FCMP: { CmpInst::Predicate Pred = static_cast(I.getOperand(1).getPredicate()); if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, Pred) || !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) return false; I.eraseFromParent(); return true; } case TargetOpcode::G_VASTART: return STI.isTargetDarwin() ? 
selectVaStartDarwin(I, MF, MRI) : selectVaStartAAPCS(I, MF, MRI); case TargetOpcode::G_INTRINSIC: return selectIntrinsic(I, MRI); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: return selectIntrinsicWithSideEffects(I, MRI); case TargetOpcode::G_IMPLICIT_DEF: { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const Register DstReg = I.getOperand(0).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); RBI.constrainGenericRegister(DstReg, *DstRC, MRI); return true; } case TargetOpcode::G_BLOCK_ADDR: { if (TM.getCodeModel() == CodeModel::Large) { materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); I.eraseFromParent(); return true; } else { I.setDesc(TII.get(AArch64::MOVaddrBA)); auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), I.getOperand(0).getReg()) .addBlockAddress(I.getOperand(1).getBlockAddress(), /* Offset */ 0, AArch64II::MO_PAGE) .addBlockAddress( I.getOperand(1).getBlockAddress(), /* Offset */ 0, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); I.eraseFromParent(); return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } } case AArch64::G_DUP: { // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by // imported patterns. Do it manually here. Avoiding generating s16 gpr is // difficult because at RBS we may end up pessimizing the fpr case if we // decided to add an anyextend to fix this. Manual selection is the most // robust solution for now. if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != AArch64::GPRRegBankID) return false; // We expect the fpr regbank case to be imported. 
LLT VecTy = MRI.getType(I.getOperand(0).getReg()); if (VecTy == LLT::fixed_vector(8, 8)) I.setDesc(TII.get(AArch64::DUPv8i8gpr)); else if (VecTy == LLT::fixed_vector(16, 8)) I.setDesc(TII.get(AArch64::DUPv16i8gpr)); else if (VecTy == LLT::fixed_vector(4, 16)) I.setDesc(TII.get(AArch64::DUPv4i16gpr)); else if (VecTy == LLT::fixed_vector(8, 16)) I.setDesc(TII.get(AArch64::DUPv8i16gpr)); else return false; return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_INTRINSIC_TRUNC: return selectIntrinsicTrunc(I, MRI); case TargetOpcode::G_INTRINSIC_ROUND: return selectIntrinsicRound(I, MRI); case TargetOpcode::G_BUILD_VECTOR: return selectBuildVector(I, MRI); case TargetOpcode::G_MERGE_VALUES: return selectMergeValues(I, MRI); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(I, MRI); case TargetOpcode::G_SHUFFLE_VECTOR: return selectShuffleVector(I, MRI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return selectExtractElt(I, MRI); case TargetOpcode::G_INSERT_VECTOR_ELT: return selectInsertElt(I, MRI); case TargetOpcode::G_CONCAT_VECTORS: return selectConcatVectors(I, MRI); case TargetOpcode::G_JUMP_TABLE: return selectJumpTable(I, MRI); case TargetOpcode::G_VECREDUCE_ADD: return selectReduction(I, MRI); case TargetOpcode::G_MEMCPY: case TargetOpcode::G_MEMCPY_INLINE: case TargetOpcode::G_MEMMOVE: case TargetOpcode::G_MEMSET: assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); return selectMOPS(I, MRI); } return false; } bool AArch64InstructionSelector::selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) { Register VecReg = I.getOperand(1).getReg(); LLT VecTy = MRI.getType(VecReg); if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit // a subregister copy afterwards. 
if (VecTy == LLT::fixed_vector(2, 32)) { Register DstReg = I.getOperand(0).getReg(); auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass}, {VecReg, VecReg}); auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) .addReg(AddP.getReg(0), 0, AArch64::ssub) .getReg(0); RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI); I.eraseFromParent(); return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI); } unsigned Opc = 0; if (VecTy == LLT::fixed_vector(16, 8)) Opc = AArch64::ADDVv16i8v; else if (VecTy == LLT::fixed_vector(8, 16)) Opc = AArch64::ADDVv8i16v; else if (VecTy == LLT::fixed_vector(4, 32)) Opc = AArch64::ADDVv4i32v; else if (VecTy == LLT::fixed_vector(2, 64)) Opc = AArch64::ADDPv2i64p; else { LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); return false; } I.setDesc(TII.get(Opc)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } return false; } bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, MachineRegisterInfo &MRI) { unsigned Mopcode; switch (GI.getOpcode()) { case TargetOpcode::G_MEMCPY: case TargetOpcode::G_MEMCPY_INLINE: Mopcode = AArch64::MOPSMemoryCopyPseudo; break; case TargetOpcode::G_MEMMOVE: Mopcode = AArch64::MOPSMemoryMovePseudo; break; case TargetOpcode::G_MEMSET: // For tagged memset see llvm.aarch64.mops.memset.tag Mopcode = AArch64::MOPSMemorySetPseudo; break; } auto &DstPtr = GI.getOperand(0); auto &SrcOrVal = GI.getOperand(1); auto &Size = GI.getOperand(2); // Create copies of the registers that can be clobbered. const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; const auto &SrcValRegClass = IsSet ? 
AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; // Constrain to specific registers RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); MIB.buildCopy(DstPtrCopy, DstPtr); MIB.buildCopy(SrcValCopy, SrcOrVal); MIB.buildCopy(SizeCopy, Size); // New instruction uses the copied registers because it must update them. // The defs are not used since they don't exist in G_MEM*. They are still // tied. // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); if (IsSet) { MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, {DstPtrCopy, SizeCopy, SrcValCopy}); } else { Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, {DstPtrCopy, SrcValCopy, SizeCopy}); } GI.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); Register JTAddr = I.getOperand(0).getReg(); unsigned JTI = I.getOperand(1).getIndex(); Register Index = I.getOperand(2).getReg(); Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); MF->getInfo()->setJumpTableEntryInfo(JTI, 4, nullptr); auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg}, {JTAddr, Index}) .addJumpTableIndex(JTI); // Build the indirect branch. 
MIB.buildInstr(AArch64::BR, {}, {TargetReg}); I.eraseFromParent(); return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); } bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); Register DstReg = I.getOperand(0).getReg(); unsigned JTI = I.getOperand(1).getIndex(); // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. auto MovMI = MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) .addJumpTableIndex(JTI, AArch64II::MO_PAGE) .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); I.eraseFromParent(); return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } bool AArch64InstructionSelector::selectTLSGlobalValue( MachineInstr &I, MachineRegisterInfo &MRI) { if (!STI.isTargetMachO()) return false; MachineFunction &MF = *I.getParent()->getParent(); MF.getFrameInfo().setAdjustsStack(true); const auto &GlobalOp = I.getOperand(1); assert(GlobalOp.getOffset() == 0 && "Shouldn't have an offset on TLS globals!"); const GlobalValue &GV = *GlobalOp.getGlobal(); auto LoadGOT = MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, {LoadGOT.getReg(0)}) .addImm(0); MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). 
MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) .addUse(AArch64::X0, RegState::Implicit) .addDef(AArch64::X0, RegState::Implicit) .addRegMask(TRI.getTLSCallPreservedMask()); MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, MRI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectIntrinsicTrunc( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); // Select the correct opcode. unsigned Opc = 0; if (!SrcTy.isVector()) { switch (SrcTy.getSizeInBits()) { default: case 16: Opc = AArch64::FRINTZHr; break; case 32: Opc = AArch64::FRINTZSr; break; case 64: Opc = AArch64::FRINTZDr; break; } } else { unsigned NumElts = SrcTy.getNumElements(); switch (SrcTy.getElementType().getSizeInBits()) { default: break; case 16: if (NumElts == 4) Opc = AArch64::FRINTZv4f16; else if (NumElts == 8) Opc = AArch64::FRINTZv8f16; break; case 32: if (NumElts == 2) Opc = AArch64::FRINTZv2f32; else if (NumElts == 4) Opc = AArch64::FRINTZv4f32; break; case 64: if (NumElts == 2) Opc = AArch64::FRINTZv2f64; break; } } if (!Opc) { // Didn't get an opcode above, bail. LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n"); return false; } // Legalization would have set us up perfectly for this; we just need to // set the opcode and move on. I.setDesc(TII.get(Opc)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } bool AArch64InstructionSelector::selectIntrinsicRound( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); // Select the correct opcode. 
unsigned Opc = 0; if (!SrcTy.isVector()) { switch (SrcTy.getSizeInBits()) { default: case 16: Opc = AArch64::FRINTAHr; break; case 32: Opc = AArch64::FRINTASr; break; case 64: Opc = AArch64::FRINTADr; break; } } else { unsigned NumElts = SrcTy.getNumElements(); switch (SrcTy.getElementType().getSizeInBits()) { default: break; case 16: if (NumElts == 4) Opc = AArch64::FRINTAv4f16; else if (NumElts == 8) Opc = AArch64::FRINTAv8f16; break; case 32: if (NumElts == 2) Opc = AArch64::FRINTAv2f32; else if (NumElts == 4) Opc = AArch64::FRINTAv4f32; break; case 64: if (NumElts == 2) Opc = AArch64::FRINTAv2f64; break; } } if (!Opc) { // Didn't get an opcode above, bail. LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n"); return false; } // Legalization would have set us up perfectly for this; we just need to // set the opcode and move on. I.setDesc(TII.get(Opc)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } bool AArch64InstructionSelector::selectVectorICmp( MachineInstr &I, MachineRegisterInfo &MRI) { Register DstReg = I.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); Register SrcReg = I.getOperand(2).getReg(); Register Src2Reg = I.getOperand(3).getReg(); LLT SrcTy = MRI.getType(SrcReg); unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); unsigned NumElts = DstTy.getNumElements(); // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16 // Third index is cc opcode: // 0 == eq // 1 == ugt // 2 == uge // 3 == ult // 4 == ule // 5 == sgt // 6 == sge // 7 == slt // 8 == sle // ne is done by negating 'eq' result. // This table below assumes that for some comparisons the operands will be // commuted. 
// ult op == commute + ugt op // ule op == commute + uge op // slt op == commute + sgt op // sle op == commute + sge op unsigned PredIdx = 0; bool SwapOperands = false; CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); switch (Pred) { case CmpInst::ICMP_NE: case CmpInst::ICMP_EQ: PredIdx = 0; break; case CmpInst::ICMP_UGT: PredIdx = 1; break; case CmpInst::ICMP_UGE: PredIdx = 2; break; case CmpInst::ICMP_ULT: PredIdx = 3; SwapOperands = true; break; case CmpInst::ICMP_ULE: PredIdx = 4; SwapOperands = true; break; case CmpInst::ICMP_SGT: PredIdx = 5; break; case CmpInst::ICMP_SGE: PredIdx = 6; break; case CmpInst::ICMP_SLT: PredIdx = 7; SwapOperands = true; break; case CmpInst::ICMP_SLE: PredIdx = 8; SwapOperands = true; break; default: llvm_unreachable("Unhandled icmp predicate"); return false; } // This table obviously should be tablegen'd when we have our GISel native // tablegen selector. static const unsigned OpcTable[4][4][9] = { { {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8}, {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8} }, { {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16}, 
{AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */} }, { {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32}, {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */} }, { {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */}, {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */} }, }; unsigned EltIdx = Log2_32(SrcEltSize / 8); unsigned NumEltsIdx = Log2_32(NumElts / 2); unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx]; if (!Opc) { LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode"); return false; } const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); const TargetRegisterClass *SrcRC = 
getRegClassForTypeOnBank(SrcTy, VecRB, true); if (!SrcRC) { LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); return false; } unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0; if (SrcTy.getSizeInBits() == 128) NotOpc = NotOpc ? AArch64::NOTv16i8 : 0; if (SwapOperands) std::swap(SrcReg, Src2Reg); auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg}); constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); // Invert if we had a 'ne' cc. if (NotOpc) { Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp}); constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); } else { MIB.buildCopy(DstReg, Cmp.getReg(0)); } RBI.constrainGenericRegister(DstReg, *SrcRC, MRI); I.eraseFromParent(); return true; } MachineInstr *AArch64InstructionSelector::emitScalarToVector( unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, MachineIRBuilder &MIRBuilder) const { auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); auto BuildFn = [&](unsigned SubregIndex) { auto Ins = MIRBuilder .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) .addImm(SubregIndex); constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); return &*Ins; }; switch (EltSize) { case 8: return BuildFn(AArch64::bsub); case 16: return BuildFn(AArch64::hsub); case 32: return BuildFn(AArch64::ssub); case 64: return BuildFn(AArch64::dsub); default: return nullptr; } } bool AArch64InstructionSelector::selectMergeValues( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); if (I.getNumOperands() != 3) return false; // Merging 2 s64s into an s128. 
if (DstTy == LLT::scalar(128)) { if (SrcTy.getSizeInBits() != 64) return false; Register DstReg = I.getOperand(0).getReg(); Register Src1Reg = I.getOperand(1).getReg(); Register Src2Reg = I.getOperand(2).getReg(); auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); if (!InsMI) return false; MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), Src2Reg, /* LaneIdx */ 1, RB, MIB); if (!Ins2MI) return false; constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); I.eraseFromParent(); return true; } if (RB.getID() != AArch64::GPRRegBankID) return false; if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) return false; auto *DstRC = &AArch64::GPR64RegClass; Register SubToRegDef = MRI.createVirtualRegister(DstRC); MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(SubToRegDef) .addImm(0) .addUse(I.getOperand(1).getReg()) .addImm(AArch64::sub_32); Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); // Need to anyext the second scalar before we can use bfm MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(SubToRegDef2) .addImm(0) .addUse(I.getOperand(2).getReg()) .addImm(AArch64::sub_32); MachineInstr &BFM = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) .addDef(I.getOperand(0).getReg()) .addUse(SubToRegDef) .addUse(SubToRegDef2) .addImm(32) .addImm(31); constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); I.eraseFromParent(); return true; } static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, const unsigned EltSize) { // Choose a lane copy opcode and subregister 
based off of the size of the // vector's elements. switch (EltSize) { case 8: CopyOpc = AArch64::DUPi8; ExtractSubReg = AArch64::bsub; break; case 16: CopyOpc = AArch64::DUPi16; ExtractSubReg = AArch64::hsub; break; case 32: CopyOpc = AArch64::DUPi32; ExtractSubReg = AArch64::ssub; break; case 64: CopyOpc = AArch64::DUPi64; ExtractSubReg = AArch64::dsub; break; default: // Unknown size, bail out. LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); return false; } return true; } MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( std::optional DstReg, const RegisterBank &DstRB, LLT ScalarTy, Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); unsigned CopyOpc = 0; unsigned ExtractSubReg = 0; if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { LLVM_DEBUG( dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); return nullptr; } const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(ScalarTy, DstRB, true); if (!DstRC) { LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); return nullptr; } const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); const LLT &VecTy = MRI.getType(VecReg); const TargetRegisterClass *VecRC = getRegClassForTypeOnBank(VecTy, VecRB, true); if (!VecRC) { LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); return nullptr; } // The register that we're going to copy into. Register InsertReg = VecReg; if (!DstReg) DstReg = MRI.createVirtualRegister(DstRC); // If the lane index is 0, we just use a subregister COPY. if (LaneIdx == 0) { auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) .addReg(VecReg, 0, ExtractSubReg); RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); return &*Copy; } // Lane copies require 128-bit wide registers. If we're dealing with an // unpacked vector, then we need to move up to that width. 
Insert an implicit // def and a subregister insert to get us there. if (VecTy.getSizeInBits() != 128) { MachineInstr *ScalarToVector = emitScalarToVector( VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); if (!ScalarToVector) return nullptr; InsertReg = ScalarToVector->getOperand(0).getReg(); } MachineInstr *LaneCopyMI = MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); // Make sure that we actually constrain the initial copy. RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); return LaneCopyMI; } bool AArch64InstructionSelector::selectExtractElt( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && "unexpected opcode!"); Register DstReg = I.getOperand(0).getReg(); const LLT NarrowTy = MRI.getType(DstReg); const Register SrcReg = I.getOperand(1).getReg(); const LLT WideTy = MRI.getType(SrcReg); (void)WideTy; assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && "source register size too small!"); assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); // Need the lane index to determine the correct copy opcode. MachineOperand &LaneIdxOp = I.getOperand(2); assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); return false; } // Find the index to extract from. 
auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); if (!VRegAndVal) return false; unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, LaneIdx, MIB); if (!Extract) return false; I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectSplitVectorUnmerge( MachineInstr &I, MachineRegisterInfo &MRI) { unsigned NumElts = I.getNumOperands() - 1; Register SrcReg = I.getOperand(NumElts).getReg(); const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(SrcReg); assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); if (SrcTy.getSizeInBits() > 128) { LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); return false; } // We implement a split vector operation by treating the sub-vectors as // scalars and extracting them. const RegisterBank &DstRB = *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { Register Dst = I.getOperand(OpIdx).getReg(); MachineInstr *Extract = emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); if (!Extract) return false; } I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && "unexpected opcode"); // TODO: Handle unmerging into GPRs and from scalars to scalars. if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != AArch64::FPRRegBankID || RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != AArch64::FPRRegBankID) { LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " "currently unsupported.\n"); return false; } // The last operand is the vector source register, and every other operand is // a register to unpack into. 
unsigned NumElts = I.getNumOperands() - 1; Register SrcReg = I.getOperand(NumElts).getReg(); const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); const LLT WideTy = MRI.getType(SrcReg); (void)WideTy; assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && "can only unmerge from vector or s128 types!"); assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && "source register size too small!"); if (!NarrowTy.isScalar()) return selectSplitVectorUnmerge(I, MRI); // Choose a lane copy opcode and subregister based off of the size of the // vector's elements. unsigned CopyOpc = 0; unsigned ExtractSubReg = 0; if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) return false; // Set up for the lane copies. MachineBasicBlock &MBB = *I.getParent(); // Stores the registers we'll be copying from. SmallVector InsertRegs; // We'll use the first register twice, so we only need NumElts-1 registers. unsigned NumInsertRegs = NumElts - 1; // If our elements fit into exactly 128 bits, then we can copy from the source // directly. Otherwise, we need to do a bit of setup with some subregister // inserts. if (NarrowTy.getSizeInBits() * NumElts == 128) { InsertRegs = SmallVector(NumInsertRegs, SrcReg); } else { // No. We have to perform subregister inserts. For each insert, create an // implicit def and a subregister insert, and save the register we create. const TargetRegisterClass *RC = getRegClassForTypeOnBank( LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()), *RBI.getRegBank(SrcReg, MRI, TRI)); unsigned SubReg = 0; bool Found = getSubRegForClass(RC, TRI, SubReg); (void)Found; assert(Found && "expected to find last operand's subeg idx"); for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); MachineInstr &ImpDefMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg); // Now, create the subregister insert from SrcReg. 
Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); MachineInstr &InsMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) .addUse(ImpDefReg) .addUse(SrcReg) .addImm(SubReg); constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); // Save the register so that we can copy from it after. InsertRegs.push_back(InsertReg); } } // Now that we've created any necessary subregister inserts, we can // create the copies. // // Perform the first copy separately as a subregister copy. Register CopyTo = I.getOperand(0).getReg(); auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) .addReg(InsertRegs[0], 0, ExtractSubReg); constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); // Now, perform the remaining copies as vector lane copies. unsigned LaneIdx = 1; for (Register InsReg : InsertRegs) { Register CopyTo = I.getOperand(LaneIdx).getReg(); MachineInstr &CopyInst = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) .addUse(InsReg) .addImm(LaneIdx); constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); ++LaneIdx; } // Separately constrain the first copy's destination. Because of the // limitation in constrainOperandRegClass, we can't guarantee that this will // actually be constrained. So, do it ourselves using the second operand. 
const TargetRegisterClass *RC = MRI.getRegClassOrNull(I.getOperand(1).getReg()); if (!RC) { LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); return false; } RBI.constrainGenericRegister(CopyTo, *RC, MRI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectConcatVectors( MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && "Unexpected opcode"); Register Dst = I.getOperand(0).getReg(); Register Op1 = I.getOperand(1).getReg(); Register Op2 = I.getOperand(2).getReg(); MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); if (!ConcatMI) return false; I.eraseFromParent(); return true; } unsigned AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, MachineFunction &MF) const { Type *CPTy = CPVal->getType(); Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); MachineConstantPool *MCP = MF.getConstantPool(); return MCP->getConstantPoolIndex(CPVal, Alignment); } MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { const TargetRegisterClass *RC; unsigned Opc; bool IsTiny = TM.getCodeModel() == CodeModel::Tiny; unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); switch (Size) { case 16: RC = &AArch64::FPR128RegClass; Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui; break; case 8: RC = &AArch64::FPR64RegClass; Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui; break; case 4: RC = &AArch64::FPR32RegClass; Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui; break; case 2: RC = &AArch64::FPR16RegClass; Opc = AArch64::LDRHui; break; default: LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " << *CPVal->getType()); return nullptr; } MachineInstr *LoadMI = nullptr; auto &MF = MIRBuilder.getMF(); unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) { // Use load(literal) for tiny code model. 
LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx); } else { auto Adrp = MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp}) .addConstantPoolIndex( CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); } MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, Size, Align(Size))); constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); return LoadMI; } /// Return an pair to do an vector elt insert of a given /// size and RB. static std::pair getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { unsigned Opc, SubregIdx; if (RB.getID() == AArch64::GPRRegBankID) { if (EltSize == 8) { Opc = AArch64::INSvi8gpr; SubregIdx = AArch64::bsub; } else if (EltSize == 16) { Opc = AArch64::INSvi16gpr; SubregIdx = AArch64::ssub; } else if (EltSize == 32) { Opc = AArch64::INSvi32gpr; SubregIdx = AArch64::ssub; } else if (EltSize == 64) { Opc = AArch64::INSvi64gpr; SubregIdx = AArch64::dsub; } else { llvm_unreachable("invalid elt size!"); } } else { if (EltSize == 8) { Opc = AArch64::INSvi8lane; SubregIdx = AArch64::bsub; } else if (EltSize == 16) { Opc = AArch64::INSvi16lane; SubregIdx = AArch64::hsub; } else if (EltSize == 32) { Opc = AArch64::INSvi32lane; SubregIdx = AArch64::ssub; } else if (EltSize == 64) { Opc = AArch64::INSvi64lane; SubregIdx = AArch64::dsub; } else { llvm_unreachable("invalid elt size!"); } } return std::make_pair(Opc, SubregIdx); } MachineInstr *AArch64InstructionSelector::emitInstr( unsigned Opcode, std::initializer_list DstOps, std::initializer_list SrcOps, MachineIRBuilder &MIRBuilder, const ComplexRendererFns &RenderFns) const { assert(Opcode && "Expected an opcode?"); assert(!isPreISelGenericOpcode(Opcode) && "Function should only be used to 
produce selected instructions!"); auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); if (RenderFns) for (auto &Fn : *RenderFns) Fn(MI); constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); return &*MI; } MachineInstr *AArch64InstructionSelector::emitAddSub( const std::array, 5> &AddrModeAndSizeToOpcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); auto Ty = MRI.getType(LHS.getReg()); assert(!Ty.isVector() && "Expected a scalar or pointer?"); unsigned Size = Ty.getSizeInBits(); assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); bool Is32Bit = Size == 32; // INSTRri form with positive arithmetic immediate. if (auto Fns = selectArithImmed(RHS)) return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, MIRBuilder, Fns); // INSTRri form with negative arithmetic immediate. if (auto Fns = selectNegArithImmed(RHS)) return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, MIRBuilder, Fns); // INSTRrx form. if (auto Fns = selectArithExtendedRegister(RHS)) return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, MIRBuilder, Fns); // INSTRrs form. 
if (auto Fns = selectShiftedRegister(RHS)) return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, MIRBuilder, Fns); return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); } MachineInstr * AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { const std::array, 5> OpcTable{ {{AArch64::ADDXri, AArch64::ADDWri}, {AArch64::ADDXrs, AArch64::ADDWrs}, {AArch64::ADDXrr, AArch64::ADDWrr}, {AArch64::SUBXri, AArch64::SUBWri}, {AArch64::ADDXrx, AArch64::ADDWrx}}}; return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); } MachineInstr * AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { const std::array, 5> OpcTable{ {{AArch64::ADDSXri, AArch64::ADDSWri}, {AArch64::ADDSXrs, AArch64::ADDSWrs}, {AArch64::ADDSXrr, AArch64::ADDSWrr}, {AArch64::SUBSXri, AArch64::SUBSWri}, {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); } MachineInstr * AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { const std::array, 5> OpcTable{ {{AArch64::SUBSXri, AArch64::SUBSWri}, {AArch64::SUBSXrs, AArch64::SUBSWrs}, {AArch64::SUBSXrr, AArch64::SUBSWrr}, {AArch64::ADDSXri, AArch64::ADDSWri}, {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); } MachineInstr * AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); MachineRegisterInfo *MRI = MIRBuilder.getMRI(); bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr}; return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); } MachineInstr * 
AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
                                     MachineOperand &RHS,
                                     MachineIRBuilder &MIRBuilder) const {
  // Subtract-with-carry that also sets flags; the opcode width follows the
  // width of the left-hand operand.
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  const LLT OperandTy = MIRBuilder.getMRI()->getType(LHS.getReg());
  const unsigned Opc =
      OperandTy.getSizeInBits() == 32 ? AArch64::SBCSWr : AArch64::SBCSXr;
  return emitInstr(Opc, {Dst}, {LHS, RHS}, MIRBuilder);
}

/// Emit a CMN of \p LHS and \p RHS: an ADDS whose result goes into a freshly
/// created scratch register of the operand width.
MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &RegInfo = MIRBuilder.getMF().getRegInfo();
  const TargetRegisterClass *ScratchRC = &AArch64::GPR64RegClass;
  if (RegInfo.getType(LHS.getReg()).getSizeInBits() == 32)
    ScratchRC = &AArch64::GPR32RegClass;
  const Register Scratch = RegInfo.createVirtualRegister(ScratchRC);
  return emitADDS(Scratch, LHS, RHS, MIRBuilder);
}

/// Emit a TST of \p LHS and \p RHS as an ANDS, preferring the immediate form,
/// then the shifted-register form, and finally the register-register form.
MachineInstr *
AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  MachineRegisterInfo &RegInfo = MIRBuilder.getMF().getRegInfo();
  const LLT OperandTy = RegInfo.getType(LHS.getReg());
  const unsigned BitWidth = OperandTy.getSizeInBits();
  const unsigned WidthIdx = BitWidth == 32 ? 1 : 0;

  // Rows of the opcode table, one per operand form.
  enum : unsigned { FormRI = 0, FormRS = 1, FormRR = 2 };
  const unsigned AndsOpcodes[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
                                      {AArch64::ANDSXrs, AArch64::ANDSWrs},
                                      {AArch64::ANDSXrr, AArch64::ANDSWrr}};

  // The immediate form of ANDS only accepts logical immediates, so try to
  // fold a constant right-hand side first.
  auto ConstRHS = getIConstantVRegValWithLookThrough(RHS.getReg(), RegInfo);
  if (ConstRHS) {
    const int64_t ImmVal = ConstRHS->Value.getSExtValue();
    if (AArch64_AM::isLogicalImmediate(ImmVal, BitWidth)) {
      auto AndsMI = MIRBuilder.buildInstr(AndsOpcodes[FormRI][WidthIdx],
                                          {OperandTy}, {LHS});
      AndsMI.addImm(AArch64_AM::encodeLogicalImmediate(ImmVal, BitWidth));
      constrainSelectedInstRegOperands(*AndsMI, TII, TRI, RBI);
      return &*AndsMI;
    }
  }

  auto ShiftedFns = selectLogicalShiftedRegister(RHS);
  if (ShiftedFns)
    return emitInstr(AndsOpcodes[FormRS][WidthIdx], {OperandTy}, {LHS},
                     MIRBuilder, ShiftedFns);

  return emitInstr(AndsOpcodes[FormRR][WidthIdx], {OperandTy}, {LHS, RHS},
                   MIRBuilder);
}

MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
  assert(Predicate.isPredicate() && "Expected predicate?");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  LLT CmpTy = MRI.getType(LHS.getReg());
  assert(!CmpTy.isVector() && "Expected scalar or pointer");
  unsigned Size = CmpTy.getSizeInBits();
  (void)Size;
  assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
  // Fold the compare into a cmn or tst if possible.
if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) return FoldCmp; auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); return emitSUBS(Dst, LHS, RHS, MIRBuilder); } MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); #ifndef NDEBUG LLT Ty = MRI.getType(Dst); assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && "Expected a 32-bit scalar register?"); #endif const Register ZReg = AArch64::WZR; AArch64CC::CondCode CC1, CC2; changeFCMPPredToAArch64CC(Pred, CC1, CC2); auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); if (CC2 == AArch64CC::AL) return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); const TargetRegisterClass *RC = &AArch64::GPR32RegClass; Register Def1Reg = MRI.createVirtualRegister(RC); Register Def2Reg = MRI.createVirtualRegister(RC); auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); return &*OrMI; } MachineInstr *AArch64InstructionSelector::emitFPCompare( Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, std::optional Pred) const { MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); LLT Ty = MRI.getType(LHS); if (Ty.isVector()) return nullptr; unsigned OpSize = Ty.getSizeInBits(); if (OpSize != 32 && OpSize != 64) return nullptr; // If this is a compare against +0.0, then we don't have // to explicitly materialize a constant. 
const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); auto IsEqualityPred = [](CmpInst::Predicate P) { return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; }; if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { // Try commutating the operands. const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { ShouldUseImm = true; std::swap(LHS, RHS); } } unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, {AArch64::FCMPSri, AArch64::FCMPDri}}; unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; // Partially build the compare. Decide if we need to add a use for the // third operand based off whether or not we're comparing against 0.0. auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); CmpMI.setMIFlags(MachineInstr::NoFPExcept); if (!ShouldUseImm) CmpMI.addUse(RHS); constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); return &*CmpMI; } MachineInstr *AArch64InstructionSelector::emitVectorConcat( std::optional Dst, Register Op1, Register Op2, MachineIRBuilder &MIRBuilder) const { // We implement a vector concat by: // 1. Use scalar_to_vector to insert the lower vector into the larger dest // 2. Insert the upper vector into the destination's upper element // TODO: some of this code is common with G_BUILD_VECTOR handling. MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); const LLT Op1Ty = MRI.getType(Op1); const LLT Op2Ty = MRI.getType(Op2); if (Op1Ty != Op2Ty) { LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); return nullptr; } assert(Op1Ty.isVector() && "Expected a vector for vector concat"); if (Op1Ty.getSizeInBits() >= 128) { LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); return nullptr; } // At the moment we just support 64 bit vector concats. 
if (Op1Ty.getSizeInBits() != 64) { LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); return nullptr; } const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); MachineInstr *WidenedOp1 = emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); MachineInstr *WidenedOp2 = emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); if (!WidenedOp1 || !WidenedOp2) { LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); return nullptr; } // Now do the insert of the upper element. unsigned InsertOpc, InsSubRegIdx; std::tie(InsertOpc, InsSubRegIdx) = getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); if (!Dst) Dst = MRI.createVirtualRegister(DstRC); auto InsElt = MIRBuilder .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) .addImm(1) /* Lane index */ .addUse(WidenedOp2->getOperand(0).getReg()) .addImm(0); constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); return &*InsElt; } MachineInstr * AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, Register Src2, AArch64CC::CondCode Pred, MachineIRBuilder &MIRBuilder) const { auto &MRI = *MIRBuilder.getMRI(); const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst); // If we used a register class, then this won't necessarily have an LLT. // Compute the size based off whether or not we have a class or bank. unsigned Size; if (const auto *RC = RegClassOrBank.dyn_cast()) Size = TRI.getRegSizeInBits(*RC); else Size = MRI.getType(Dst).getSizeInBits(); // Some opcodes use s1. 
assert(Size <= 64 && "Expected 64 bits or less only!"); static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr}; unsigned Opc = OpcTable[Size == 64]; auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred); constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI); return &*CSINC; } MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I, Register CarryReg) { MachineRegisterInfo *MRI = MIB.getMRI(); unsigned Opcode = I.getOpcode(); // If the instruction is a SUB, we need to negate the carry, // because borrowing is indicated by carry-flag == 0. bool NeedsNegatedCarry = (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE); // If the previous instruction will already produce the correct carry, do not // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences // generated during legalization of wide add/sub. This optimization depends on // these sequences not being interrupted by other instructions. MachineInstr *SrcMI = MRI->getVRegDef(CarryReg); if (SrcMI == I.getPrevNode()) { if (auto *CarrySrcMI = dyn_cast(SrcMI)) { bool ProducesNegatedCarry = CarrySrcMI->isSub(); if (NeedsNegatedCarry == ProducesNegatedCarry && CarrySrcMI->isUnsigned()) return nullptr; } } Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass); if (NeedsNegatedCarry) { // (0 - Carry) sets !C in NZCV when Carry == 1 Register ZReg = AArch64::WZR; return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB); } // (Carry - 1) sets !C in NZCV when Carry == 0 auto Fns = select12BitValueWithLeftShift(1); return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns); } bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI) { auto &CarryMI = cast(I); if (auto *CarryInMI = dyn_cast(&I)) { // Set NZCV carry according to carry-in VReg emitCarryIn(I, CarryInMI->getCarryInReg()); } // Emit the operation and get the correct condition code. 
auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(), CarryMI.getLHS(), CarryMI.getRHS(), MIB); Register CarryOutReg = CarryMI.getCarryOutReg(); // Don't convert carry-out to VReg if it is never used if (!MRI.use_nodbg_empty(CarryOutReg)) { // Now, put the overflow result in the register given by the first operand // to the overflow op. CSINC increments the result when the predicate is // false, so to get the increment when it's true, we need to use the // inverse. In this case, we want to increment when carry is set. Register ZReg = AArch64::WZR; emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg, getInvertedCondCode(OpAndCC.second), MIB); } I.eraseFromParent(); return true; } std::pair AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case TargetOpcode::G_SADDO: return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); case TargetOpcode::G_UADDO: return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); case TargetOpcode::G_SSUBO: return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); case TargetOpcode::G_USUBO: return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); case TargetOpcode::G_SADDE: return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); case TargetOpcode::G_UADDE: return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); case TargetOpcode::G_SSUBE: return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); case TargetOpcode::G_USUBE: return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); } } /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be /// expressed as a conjunction. /// \param CanNegate Set to true if we can negate the whole sub-tree just by /// changing the conditions on the CMP tests. 
/// (this means we can call emitConjunctionRec() with /// Negate==true on this sub-tree) /// \param MustBeFirst Set to true if this subtree needs to be negated and we /// cannot do the negation naturally. We are required to /// emit the subtree first in this case. /// \param WillNegate Is true if are called when the result of this /// subexpression must be negated. This happens when the /// outer expression is an OR. We can use this fact to know /// that we have a double negation (or (or ...) ...) that /// can be implemented for free. static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, MachineRegisterInfo &MRI, unsigned Depth = 0) { if (!MRI.hasOneNonDBGUse(Val)) return false; MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); if (isa(ValDef)) { CanNegate = true; MustBeFirst = false; return true; } // Protect against exponential runtime and stack overflow. if (Depth > 6) return false; if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { bool IsOR = Opcode == TargetOpcode::G_OR; Register O0 = ValDef->getOperand(1).getReg(); Register O1 = ValDef->getOperand(2).getReg(); bool CanNegateL; bool MustBeFirstL; if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) return false; bool CanNegateR; bool MustBeFirstR; if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) return false; if (MustBeFirstL && MustBeFirstR) return false; if (IsOR) { // For an OR expression we need to be able to naturally negate at least // one side or we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; // If we the result of the OR will be negated and we can naturally negate // the leaves, then this sub-tree as a whole negates naturally. CanNegate = WillNegate && CanNegateL && CanNegateR; // If we cannot naturally negate the whole sub-tree, then this must be // emitted first. 
MustBeFirst = !CanNegate; } else { assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); // We cannot naturally negate an AND operation. CanNegate = false; MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } return false; } MachineInstr *AArch64InstructionSelector::emitConditionalComparison( Register LHS, Register RHS, CmpInst::Predicate CC, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, MachineIRBuilder &MIB) const { // TODO: emit CMN as an optimization. auto &MRI = *MIB.getMRI(); LLT OpTy = MRI.getType(LHS); assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); unsigned CCmpOpc; std::optional C; if (CmpInst::isIntPredicate(CC)) { C = getIConstantVRegValWithLookThrough(RHS, MRI); if (C && C->Value.ult(32)) CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi; else CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; } else { switch (OpTy.getSizeInBits()) { case 16: CCmpOpc = AArch64::FCCMPHrr; break; case 32: CCmpOpc = AArch64::FCCMPSrr; break; case 64: CCmpOpc = AArch64::FCCMPDrr; break; default: return nullptr; } } AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); auto CCmp = MIB.buildInstr(CCmpOpc, {}, {LHS}); if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi) CCmp.addImm(C->Value.getZExtValue()); else CCmp.addReg(RHS); CCmp.addImm(NZCV).addImm(Predicate); constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); return &*CCmp; } MachineInstr *AArch64InstructionSelector::emitConjunctionRec( Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { // We're at a tree leaf, produce a conditional comparison operation. 
auto &MRI = *MIB.getMRI(); MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); if (auto *Cmp = dyn_cast(ValDef)) { Register LHS = Cmp->getLHSReg(); Register RHS = Cmp->getRHSReg(); CmpInst::Predicate CC = Cmp->getCond(); if (Negate) CC = CmpInst::getInversePredicate(CC); if (isa(Cmp)) { OutCC = changeICMPPredToAArch64CC(CC); } else { // Handle special FP cases. AArch64CC::CondCode ExtraCC; changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); // Some floating point conditions can't be tested with a single condition // code. Construct an additional comparison in this case. if (ExtraCC != AArch64CC::AL) { MachineInstr *ExtraCmp; if (!CCOp) ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); else ExtraCmp = emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); CCOp = ExtraCmp->getOperand(0).getReg(); Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain if (!CCOp) { auto Dst = MRI.cloneVirtualRegister(LHS); if (isa(Cmp)) return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); return emitFPCompare(Cmp->getOperand(2).getReg(), Cmp->getOperand(3).getReg(), MIB); } // Otherwise produce a ccmp. return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); } assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); bool IsOR = Opcode == TargetOpcode::G_OR; Register LHS = ValDef->getOperand(1).getReg(); bool CanNegateL; bool MustBeFirstL; bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); assert(ValidL && "Valid conjunction/disjunction tree"); (void)ValidL; Register RHS = ValDef->getOperand(2).getReg(); bool CanNegateR; bool MustBeFirstR; bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); assert(ValidR && "Valid conjunction/disjunction tree"); (void)ValidR; // Swap sub-tree that must come first to the right side. 
if (MustBeFirstL) { assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); std::swap(LHS, RHS); std::swap(CanNegateL, CanNegateR); std::swap(MustBeFirstL, MustBeFirstR); } bool NegateR; bool NegateAfterR; bool NegateL; bool NegateAfterAll; if (Opcode == TargetOpcode::G_OR) { // Swap the sub-tree that we can negate naturally to the left. if (!CanNegateL) { assert(CanNegateR && "at least one side must be negatable"); assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); assert(!Negate); std::swap(LHS, RHS); NegateR = false; NegateAfterR = true; } else { // Negate the left sub-tree if possible, otherwise negate the result. NegateR = CanNegateR; NegateAfterR = !CanNegateR; } NegateL = true; NegateAfterAll = !Negate; } else { assert(Opcode == TargetOpcode::G_AND && "Valid conjunction/disjunction tree"); assert(!Negate && "Valid conjunction/disjunction tree"); NegateL = false; NegateR = false; NegateAfterR = false; NegateAfterAll = false; } // Emit sub-trees. AArch64CC::CondCode RHSCC; MachineInstr *CmpR = emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); MachineInstr *CmpL = emitConjunctionRec( LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } MachineInstr *AArch64InstructionSelector::emitConjunction( Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { bool DummyCanNegate; bool DummyMustBeFirst; if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, *MIB.getMRI())) return nullptr; return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); } bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, MachineInstr &CondMI) { AArch64CC::CondCode AArch64CC; MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); if (!ConjMI) return false; emitSelect(SelI.getReg(0), SelI.getTrueReg(), 
SelI.getFalseReg(), AArch64CC, MIB); SelI.eraseFromParent(); return true; } bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { MachineRegisterInfo &MRI = *MIB.getMRI(); // We want to recognize this pattern: // // $z = G_FCMP pred, $x, $y // ... // $w = G_SELECT $z, $a, $b // // Where the value of $z is *only* ever used by the G_SELECT (possibly with // some copies/truncs in between.) // // If we see this, then we can emit something like this: // // fcmp $x, $y // fcsel $w, $a, $b, pred // // Rather than emitting both of the rather long sequences in the standard // G_FCMP/G_SELECT select methods. // First, check if the condition is defined by a compare. MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); // We can only fold if all of the defs have one use. Register CondDefReg = CondDef->getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDefReg)) { // Unless it's another select. for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { if (CondDef == &UI) continue; if (UI.getOpcode() != TargetOpcode::G_SELECT) return false; } } // Is the condition defined by a compare? unsigned CondOpc = CondDef->getOpcode(); if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { if (tryOptSelectConjunction(I, *CondDef)) return true; return false; } AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { auto Pred = static_cast(CondDef->getOperand(1).getPredicate()); CondCode = changeICMPPredToAArch64CC(Pred); emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), CondDef->getOperand(1), MIB); } else { // Get the condition code for the select. auto Pred = static_cast(CondDef->getOperand(1).getPredicate()); AArch64CC::CondCode CondCode2; changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two // instructions to emit the comparison. // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be // unnecessary. 
if (CondCode2 != AArch64CC::AL) return false; if (!emitFPCompare(CondDef->getOperand(2).getReg(), CondDef->getOperand(3).getReg(), MIB)) { LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); return false; } } // Emit the select. emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), I.getOperand(3).getReg(), CondCode, MIB); I.eraseFromParent(); return true; } MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && "Unexpected MachineOperand"); MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); // We want to find this sort of thing: // x = G_SUB 0, y // G_ICMP z, x // // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. // e.g: // // cmn z, y // Check if the RHS or LHS of the G_ICMP is defined by a SUB MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); auto P = static_cast(Predicate.getPredicate()); // Given this: // // x = G_SUB 0, y // G_ICMP x, z // // Produce this: // // cmn y, z if (isCMN(LHSDef, P, MRI)) return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); // Same idea here, but with the RHS of the compare instead: // // Given this: // // x = G_SUB 0, y // G_ICMP z, x // // Produce this: // // cmn z, y if (isCMN(RHSDef, P, MRI)) return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); // Given this: // // z = G_AND x, y // G_ICMP z, 0 // // Produce this if the compare is signed: // // tst x, y if (!CmpInst::isUnsigned(P) && LHSDef && LHSDef->getOpcode() == TargetOpcode::G_AND) { // Make sure that the RHS is 0. 
auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!ValAndVReg || ValAndVReg->Value != 0) return nullptr; return emitTST(LHSDef->getOperand(1), LHSDef->getOperand(2), MIRBuilder); } return nullptr; } bool AArch64InstructionSelector::selectShuffleVector( MachineInstr &I, MachineRegisterInfo &MRI) { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); Register Src1Reg = I.getOperand(1).getReg(); const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); const LLT Src2Ty = MRI.getType(Src2Reg); ArrayRef Mask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if // it's originated from a <1 x T> type. Those should have been lowered into // G_BUILD_VECTOR earlier. if (!Src1Ty.isVector() || !Src2Ty.isVector()) { LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); return false; } unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector CstIdxs; for (int Val : Mask) { // For now, any undef indexes we'll just assume to be 0. This should be // optimized in future, e.g. to select DUP etc. Val = Val < 0 ? 0 : Val; for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); } } // Use a constant pool to load the index vector for TBL. Constant *CPVal = ConstantVector::get(CstIdxs); MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); if (!IndexLoad) { LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); return false; } if (DstTy.getSizeInBits() != 128) { assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); // This case can be done with TBL1. 
MachineInstr *Concat = emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB); if (!Concat) { LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); return false; } // The constant pool load will be 64 bits, so need to convert to FPR128 reg. IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, IndexLoad->getOperand(0).getReg(), MIB); auto TBL1 = MIB.buildInstr( AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); auto Copy = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) .addReg(TBL1.getReg(0), 0, AArch64::dsub); RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); I.eraseFromParent(); return true; } // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive // Q registers for regalloc. SmallVector Regs = {Src1Reg, Src2Reg}; auto RegSeq = createQTuple(Regs, MIB); auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, {RegSeq, IndexLoad->getOperand(0)}); constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); I.eraseFromParent(); return true; } MachineInstr *AArch64InstructionSelector::emitLaneInsert( std::optional DstReg, Register SrcReg, Register EltReg, unsigned LaneIdx, const RegisterBank &RB, MachineIRBuilder &MIRBuilder) const { MachineInstr *InsElt = nullptr; const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); // Create a register to define with the insert if one wasn't passed in. 
if (!DstReg)
    DstReg = MRI.createVirtualRegister(DstRC);

  unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
  unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;

  if (RB.getID() == AArch64::FPRRegBankID) {
    // FPR element: first place the scalar in a vector register, then insert
    // from lane 0 of that vector.
    auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(InsSub->getOperand(0).getReg())
                 .addImm(0);
  } else {
    // Otherwise the element register is used directly as the inserted value.
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(EltReg);
  }

  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return InsElt;
}

/// Select an extend (G_SEXT/G_ZEXT/G_ANYEXT) of a constant-lane
/// G_EXTRACT_VECTOR_ELT as a single SMOV/UMOV. Returns false, leaving \p MI
/// untouched, when the pattern does not match.
bool AArch64InstructionSelector::selectUSMovFromExtend(
    MachineInstr &MI, MachineRegisterInfo &MRI) {
  if (MI.getOpcode() != TargetOpcode::G_SEXT &&
      MI.getOpcode() != TargetOpcode::G_ZEXT &&
      MI.getOpcode() != TargetOpcode::G_ANYEXT)
    return false;
  // G_ZEXT and G_ANYEXT both take the unsigned (UMOV) path.
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
  const Register DefReg = MI.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(DefReg);
  unsigned DstSize = DstTy.getSizeInBits();

  if (DstSize != 32 && DstSize != 64)
    return false;

  // The extended value must come from an extract with a constant lane index.
  MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
                                       MI.getOperand(1).getReg(), MRI);
  int64_t Lane;
  if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
    return false;
  Register Src0 = Extract->getOperand(1).getReg();

  const LLT &VecTy = MRI.getType(Src0);

  // Source vectors that are not 128 bits wide are first moved into an FPR128
  // register.
  if (VecTy.getSizeInBits() != 128) {
    const MachineInstr *ScalarToVector = emitScalarToVector(
        VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
    assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
    Src0 = ScalarToVector->getOperand(0).getReg();
  }

  // Pick the opcode from (destination size, source element size, signedness).
  unsigned Opcode;
  if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
    Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ?
AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
  else
    llvm_unreachable("Unexpected type combo for S/UMov!");

  // We may need to generate one of these, depending on the type and sign of the
  // input:
  //  DstReg = SMOV Src0, Lane;
  //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
  MachineInstr *ExtI = nullptr;
  if (DstSize == 64 && !IsSigned) {
    // Unsigned extend to 64 bits: UMOV into a 32-bit register, then widen
    // with SUBREG_TO_REG.
    Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
    MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
    ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
               .addImm(0)
               .addUse(NewReg)
               .addImm(AArch64::sub_32);
    RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
  } else
    ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);

  constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}

/// Select a G_INSERT_VECTOR_ELT with a constant lane index as a lane insert,
/// widening to 128 bits and narrowing back when the vector is smaller.
/// Returns false if the element size or index is unsupported.
bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);

  // Get information on the destination.
  Register DstReg = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  unsigned VecSize = DstTy.getSizeInBits();

  // Get information on the element we want to insert into the destination.
  Register EltReg = I.getOperand(2).getReg();
  const LLT EltTy = MRI.getType(EltReg);
  unsigned EltSize = EltTy.getSizeInBits();
  if (EltSize < 8 || EltSize > 64)
    return false;

  // Find the definition of the index. Bail out if it's not defined by a
  // G_CONSTANT.
  Register IdxReg = I.getOperand(3).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
  if (!VRegAndVal)
    return false;
  unsigned LaneIdx = VRegAndVal->Value.getSExtValue();

  // Perform the lane insert.
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);

  if (VecSize < 128) {
    // If the vector we're inserting into is smaller than 128 bits, widen it
    // to 128 to do the insert.
    MachineInstr *ScalarToVec =
        emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
    if (!ScalarToVec)
      return false;
    SrcReg = ScalarToVec->getOperand(0).getReg();
  }

  // Create an insert into a new FPR128 register.
  // Note that if our vector is already 128 bits, we end up emitting an extra
  // register.
  MachineInstr *InsMI =
      emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);

  if (VecSize < 128) {
    // If we had to widen to perform the insert, then we have to demote back to
    // the original size to get the result we want.
    Register DemoteVec = InsMI->getOperand(0).getReg();
    const TargetRegisterClass *RC =
        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
    // Note: this comparison also rejects a null RC.
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }
    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
                        << "\n");
      return false;
    }
    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
        .addReg(DemoteVec, 0, SubReg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // No widening needed.
    InsMI->getOperand(0).setReg(DstReg);
    constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}

/// Materialize the constant vector \p CV into \p Dst.
///
/// An all-zero constant of 128 or 64 bits is built with MOVIv2d_ns (plus a
/// dsub subregister copy for 64 bits); everything else is loaded from the
/// constant pool and copied into \p Dst.
/// \returns the instruction defining \p Dst, or nullptr if the constant pool
///          load could not be emitted.
MachineInstr *
AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
                                               MachineIRBuilder &MIRBuilder,
                                               MachineRegisterInfo &MRI) {
  LLT DstTy = MRI.getType(Dst);
  unsigned DstSize = DstTy.getSizeInBits();
  if (CV->isNullValue()) {
    if (DstSize == 128) {
      auto Mov =
          MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
      constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
      return &*Mov;
    }

    if (DstSize == 64) {
      auto Mov =
          MIRBuilder
              .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
              .addImm(0);
      auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
                      .addReg(Mov.getReg(0), 0, AArch64::dsub);
      RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
      return &*Copy;
    }
    // Other sizes of zero vector fall through to the constant pool path.
  }

  auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
  if (!CPLoad) {
    LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
    return nullptr;
  }

  auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
  // Give Dst the same register class as the loaded value.
  RBI.constrainGenericRegister(
      Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
  return &*Copy;
}

/// If every source of G_BUILD_VECTOR \p I is a G_CONSTANT or G_FCONSTANT,
/// select it via emitConstantVector(). Returns false otherwise.
bool AArch64InstructionSelector::tryOptConstantBuildVec(
    MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  unsigned DstSize = DstTy.getSizeInBits();
  assert(DstSize <= 128 && "Unexpected build_vec type!");
  if (DstSize < 32)
    return false;
  // Check if we're building a constant vector, in which case we want to
  // generate a constant pool load instead of a vector insert sequence.
SmallVector Csts; for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { // Try to find G_CONSTANT or G_FCONSTANT auto *OpMI = getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); if (OpMI) Csts.emplace_back( const_cast(OpMI->getOperand(1).getCImm())); else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, I.getOperand(Idx).getReg(), MRI))) Csts.emplace_back( const_cast(OpMI->getOperand(1).getFPImm())); else return false; } Constant *CV = ConstantVector::get(Csts); if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) return false; I.eraseFromParent(); return true; } bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( MachineInstr &I, MachineRegisterInfo &MRI) { // Given: // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef // // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. Register Dst = I.getOperand(0).getReg(); Register EltReg = I.getOperand(1).getReg(); LLT EltTy = MRI.getType(EltReg); // If the index isn't on the same bank as its elements, then this can't be a // SUBREG_TO_REG. 
const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
  const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
  if (EltRB != DstRB)
    return false;
  // Every source after the first must be defined by G_IMPLICIT_DEF.
  if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
             [&MRI](const MachineOperand &Op) {
               return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
                                    MRI);
             }))
    return false;
  unsigned SubReg;
  const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
  if (!EltRC)
    return false;
  const TargetRegisterClass *DstRC =
      getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
  if (!DstRC)
    return false;
  // SubReg is filled in by getSubRegForClass on success.
  if (!getSubRegForClass(EltRC, TRI, SubReg))
    return false;
  auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
                         .addImm(0)
                         .addUse(EltReg)
                         .addImm(SubReg);
  I.eraseFromParent();
  constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
  return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
}

/// Select a G_BUILD_VECTOR. The constant-vector and SUBREG_TO_REG fast paths
/// are tried first; otherwise the vector is built with a scalar-to-vector
/// move of element 0 followed by one lane insert per remaining element.
bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
                                                   MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  // Until we port more of the optimized selections, for now just use a vector
  // insert sequence.
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
  unsigned EltSize = EltTy.getSizeInBits();

  if (tryOptConstantBuildVec(I, DstTy, MRI))
    return true;
  if (tryOptBuildVecToSubregToReg(I, MRI))
    return true;

  if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
    return false; // Don't support all element types yet.
  const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);

  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineInstr *ScalarToVec =
      emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
                         I.getOperand(1).getReg(), MIB);
  if (!ScalarToVec)
    return false;

  Register DstVec = ScalarToVec->getOperand(0).getReg();
  unsigned DstSize = DstTy.getSizeInBits();

  // Keep track of the last MI we inserted. Later on, we might be able to save
  // a copy using it.
  MachineInstr *PrevMI = nullptr;
  // Operand i of the G_BUILD_VECTOR goes into lane i - 1 (operand 0 is the
  // def, operand 1 was handled by the scalar-to-vector move above).
  for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
    // Note that if we don't do a subregister copy, we can end up making an
    // extra register.
    PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
                              i - 1, RB, MIB);
    DstVec = PrevMI->getOperand(0).getReg();
  }

  // If DstTy's size in bits is less than 128, then emit a subregister copy
  // from DstVec to the last register we've defined.
  if (DstSize < 128) {
    // Force this to be FPR using the destination vector.
    const TargetRegisterClass *RC =
        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
    if (!RC)
      return false;
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }

    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
                        << "\n");
      return false;
    }

    // NOTE(review): Reg is only written into operand 1 of I, and I is erased
    // at the end of this function, so this register looks unused - confirm
    // before removing, since it affects virtual register numbering.
    Register Reg = MRI.createVirtualRegister(RC);
    Register DstReg = I.getOperand(0).getReg();

    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
    MachineOperand &RegOp = I.getOperand(1);
    RegOp.setReg(Reg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // We don't need a subregister copy. Save a copy by re-using the
    // destination register on the final insert.
    assert(PrevMI && "PrevMI was null?");
    PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
    constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}

/// Emit the multi-vector load \p Opc for intrinsic \p I and copy each of its
/// \p NumVecs result subregisters into the intrinsic's destination registers.
/// Does not erase \p I.
bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
                                                           unsigned NumVecs,
                                                           MachineInstr &I) {
  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  assert(Opc && "Expected an opcode?");
  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
  auto &MRI = *MIB.getMRI();
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 64 || Size == 128) &&
         "Destination must be 64 bits or 128 bits?");
  // First subregister index of the result tuple; later ones are reached by
  // adding the vector index.
  unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
  // The pointer is the last operand of the intrinsic.
  auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
  assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
  auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
  Load.cloneMemRefs(I);
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  Register SelectedLoadDst = Load->getOperand(0).getReg();
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
    // Emit the subreg copies and immediately select them.
    // FIXME: We should refactor our copy code into an emitCopy helper and
    // clean up uses of this pattern elsewhere in the selector.
    selectCopy(*Vec, TII, MRI, TRI, RBI);
  }
  return true;
}

/// Select a G_INTRINSIC_W_SIDE_EFFECTS. Returns false for intrinsics not
/// handled here; otherwise emits the replacement and erases \p I.
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  // Find the intrinsic ID.
  unsigned IntrinID = I.getIntrinsicID();

  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT P0 = LLT::pointer(0, 64);
  // Select the instruction.
  switch (IntrinID) {
  default:
    return false;
  case Intrinsic::aarch64_ldxp:
  case Intrinsic::aarch64_ldaxp: {
    auto NewI = MIB.buildInstr(
        IntrinID == Intrinsic::aarch64_ldxp ?
AArch64::LDXPX : AArch64::LDAXPX, {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, {I.getOperand(3)}); NewI.cloneMemRefs(I); constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); break; } case Intrinsic::trap: MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1); break; case Intrinsic::debugtrap: MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; case Intrinsic::ubsantrap: MIB.buildInstr(AArch64::BRK, {}, {}) .addImm(I.getOperand(1).getImm() | ('U' << 8)); break; case Intrinsic::aarch64_neon_ld2: { LLT Ty = MRI.getType(I.getOperand(0).getReg()); unsigned Opc = 0; if (Ty == LLT::fixed_vector(8, S8)) Opc = AArch64::LD2Twov8b; else if (Ty == LLT::fixed_vector(16, S8)) Opc = AArch64::LD2Twov16b; else if (Ty == LLT::fixed_vector(4, S16)) Opc = AArch64::LD2Twov4h; else if (Ty == LLT::fixed_vector(8, S16)) Opc = AArch64::LD2Twov8h; else if (Ty == LLT::fixed_vector(2, S32)) Opc = AArch64::LD2Twov2s; else if (Ty == LLT::fixed_vector(4, S32)) Opc = AArch64::LD2Twov4s; else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) Opc = AArch64::LD2Twov2d; else if (Ty == S64 || Ty == P0) Opc = AArch64::LD1Twov1d; else llvm_unreachable("Unexpected type for ld2!"); selectVectorLoadIntrinsic(Opc, 2, I); break; } case Intrinsic::aarch64_neon_ld4: { LLT Ty = MRI.getType(I.getOperand(0).getReg()); unsigned Opc = 0; if (Ty == LLT::fixed_vector(8, S8)) Opc = AArch64::LD4Fourv8b; else if (Ty == LLT::fixed_vector(16, S8)) Opc = AArch64::LD4Fourv16b; else if (Ty == LLT::fixed_vector(4, S16)) Opc = AArch64::LD4Fourv4h; else if (Ty == LLT::fixed_vector(8, S16)) Opc = AArch64::LD4Fourv8h; else if (Ty == LLT::fixed_vector(2, S32)) Opc = AArch64::LD4Fourv2s; else if (Ty == LLT::fixed_vector(4, S32)) Opc = AArch64::LD4Fourv4s; else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) Opc = AArch64::LD4Fourv2d; else if (Ty == S64 || Ty == P0) Opc = AArch64::LD1Fourv1d; else llvm_unreachable("Unexpected type for ld4!"); selectVectorLoadIntrinsic(Opc, 4, 
I); break; } case Intrinsic::aarch64_neon_st2: { Register Src1 = I.getOperand(1).getReg(); Register Src2 = I.getOperand(2).getReg(); Register Ptr = I.getOperand(3).getReg(); LLT Ty = MRI.getType(Src1); unsigned Opc; if (Ty == LLT::fixed_vector(8, S8)) Opc = AArch64::ST2Twov8b; else if (Ty == LLT::fixed_vector(16, S8)) Opc = AArch64::ST2Twov16b; else if (Ty == LLT::fixed_vector(4, S16)) Opc = AArch64::ST2Twov4h; else if (Ty == LLT::fixed_vector(8, S16)) Opc = AArch64::ST2Twov8h; else if (Ty == LLT::fixed_vector(2, S32)) Opc = AArch64::ST2Twov2s; else if (Ty == LLT::fixed_vector(4, S32)) Opc = AArch64::ST2Twov4s; else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) Opc = AArch64::ST2Twov2d; else if (Ty == S64 || Ty == P0) Opc = AArch64::ST1Twov1d; else llvm_unreachable("Unexpected type for st2!"); SmallVector Regs = {Src1, Src2}; Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB) : createDTuple(Regs, MIB); auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); Store.cloneMemRefs(I); constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); break; } case Intrinsic::aarch64_mops_memset_tag: { // Transform // %dst:gpr(p0) = \ // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag), // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64) // where %dst is updated, into // %Rd:GPR64common, %Rn:GPR64) = \ // MOPSMemorySetTaggingPseudo \ // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64 // where Rd and Rn are tied. // It is expected that %val has been extended to s64 in legalization. // Note that the order of the size/value operands are swapped. Register DstDef = I.getOperand(0).getReg(); // I.getOperand(1) is the intrinsic function Register DstUse = I.getOperand(2).getReg(); Register ValUse = I.getOperand(3).getReg(); Register SizeUse = I.getOperand(4).getReg(); // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one. // Therefore an additional virtual register is requried for the updated size // operand. 
This value is not accessible via the semantics of the intrinsic. Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64)); auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo, {DstDef, SizeDef}, {DstUse, SizeUse, ValUse}); Memset.cloneMemRefs(I); constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI); break; } } I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) { unsigned IntrinID = I.getIntrinsicID(); switch (IntrinID) { default: break; case Intrinsic::aarch64_crypto_sha1h: { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(2).getReg(); // FIXME: Should this be an assert? if (MRI.getType(DstReg).getSizeInBits() != 32 || MRI.getType(SrcReg).getSizeInBits() != 32) return false; // The operation has to happen on FPRs. Set up some new FPR registers for // the source and destination if they are on GPRs. if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); MIB.buildCopy({SrcReg}, {I.getOperand(2)}); // Make sure the copy ends up getting constrained properly. RBI.constrainGenericRegister(I.getOperand(2).getReg(), AArch64::GPR32RegClass, MRI); } if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); // Actually insert the instruction. auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); // Did we create a new register for the destination? if (DstReg != I.getOperand(0).getReg()) { // Yep. Copy the result of the instruction back into the original // destination. 
MIB.buildCopy({I.getOperand(0)}, {DstReg}); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR32RegClass, MRI); } I.eraseFromParent(); return true; } case Intrinsic::ptrauth_sign: { Register DstReg = I.getOperand(0).getReg(); Register ValReg = I.getOperand(2).getReg(); uint64_t Key = I.getOperand(3).getImm(); Register DiscReg = I.getOperand(4).getReg(); auto DiscVal = getIConstantVRegVal(DiscReg, MRI); bool IsDiscZero = DiscVal && DiscVal->isZero(); if (Key > AArch64PACKey::LAST) return false; unsigned Opcodes[][4] = { {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB}, {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}}; unsigned Opcode = Opcodes[IsDiscZero][Key]; auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg}); if (!IsDiscZero) { PAC.addUse(DiscReg); RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI); } RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); I.eraseFromParent(); return true; } case Intrinsic::ptrauth_strip: { Register DstReg = I.getOperand(0).getReg(); Register ValReg = I.getOperand(2).getReg(); uint64_t Key = I.getOperand(3).getImm(); if (Key > AArch64PACKey::LAST) return false; unsigned Opcode = getXPACOpcodeForKey((AArch64PACKey::ID)Key); MIB.buildInstr(Opcode, {DstReg}, {ValReg}); RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); RBI.constrainGenericRegister(ValReg, AArch64::GPR64RegClass, MRI); I.eraseFromParent(); return true; } case Intrinsic::ptrauth_blend: { MachineFunction &MF = *I.getParent()->getParent(); auto RHS = getIConstantVRegVal(I.getOperand(3).getReg(), MRI); if (RHS && (RHS->getZExtValue() <= 0xffff)) { I.setDesc(TII.get(AArch64::MOVKXi)); I.removeOperand(3); I.removeOperand(1); MachineInstrBuilder(MF, I) .addImm(RHS->getZExtValue() & 0xffff) .addImm(48) .constrainAllUses(TII, TRI, RBI); } else { I.setDesc(TII.get(AArch64::BFMXri)); I.removeOperand(1); MachineInstrBuilder(MF, I).addImm(16).addImm(15).constrainAllUses( TII, 
TRI, RBI); } return true; } case Intrinsic::frameaddress: case Intrinsic::returnaddress: { MachineFunction &MF = *I.getParent()->getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Depth = I.getOperand(2).getImm(); Register DstReg = I.getOperand(0).getReg(); RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { if (!MFReturnAddr) { // Insert the copy from LR/X30 into the entry block, before it can be // clobbered by anything. MFI.setReturnAddressIsTaken(true); MFReturnAddr = getFunctionLiveInPhysReg( MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); } if (STI.hasPAuth()) { MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); } else { MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); MIB.buildInstr(AArch64::XPACLRI); MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); } I.eraseFromParent(); return true; } MFI.setFrameAddressIsTaken(true); Register FrameAddr(AArch64::FP); while (Depth--) { Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); auto Ldr = MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); FrameAddr = NextFrame; } if (IntrinID == Intrinsic::frameaddress) MIB.buildCopy({DstReg}, {FrameAddr}); else { MFI.setReturnAddressIsTaken(true); if (STI.hasPAuth()) { Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); } else { MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) .addImm(1); MIB.buildInstr(AArch64::XPACLRI); MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); } } I.eraseFromParent(); return true; } case Intrinsic::swift_async_context_addr: auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, {Register(AArch64::FP)}) .addImm(8) .addImm(0); constrainSelectedInstRegOperands(*Sub, TII, TRI, 
RBI); MF->getFrameInfo().setFrameAddressIsTaken(true); MF->getInfo()->setHasSwiftAsyncContext(true); I.eraseFromParent(); return true; } return false; } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt || *MaybeImmed > 31) return std::nullopt; uint64_t Enc = (32 - *MaybeImmed) & 0x1f; return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt || *MaybeImmed > 31) return std::nullopt; uint64_t Enc = 31 - *MaybeImmed; return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt || *MaybeImmed > 63) return std::nullopt; uint64_t Enc = (64 - *MaybeImmed) & 0x3f; return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt || *MaybeImmed > 63) return std::nullopt; uint64_t Enc = 63 - *MaybeImmed; return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } /// Helper to select an immediate value that can be represented as a 12-bit /// value shifted left by either 0 or 12. If it is possible to do so, return /// the immediate and shift value. If not, return std::nullopt. /// /// Used by selectArithImmed and selectNegArithImmed. 
InstructionSelector::ComplexRendererFns AArch64InstructionSelector::select12BitValueWithLeftShift( uint64_t Immed) const { unsigned ShiftAmt; if (Immed >> 12 == 0) { ShiftAmt = 0; } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { ShiftAmt = 12; Immed = Immed >> 12; } else return std::nullopt; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, }}; } /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { // This function is called from the addsub_shifted_imm ComplexPattern, // which lists [imm] as the list of opcode it's interested in, however // we still need to check whether the operand is actually an immediate // here because the ComplexPattern opcode list is only used in // root-level opcode matching. auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt) return std::nullopt; return select12BitValueWithLeftShift(*MaybeImmed); } /// SelectNegArithImmed - As above, but negates the value before trying to /// select it. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { // We need a register here, because we need to know if we have a 64 or 32 // bit immediate. if (!Root.isReg()) return std::nullopt; auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == std::nullopt) return std::nullopt; uint64_t Immed = *MaybeImmed; // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" // have the opposite effect on the C flag, so this pattern mustn't match under // those circumstances. 
if (Immed == 0) return std::nullopt; // Check if we're dealing with a 32-bit type on the root or a 64-bit type on // the root. MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); if (MRI.getType(Root.getReg()).getSizeInBits() == 32) Immed = ~((uint32_t)Immed) + 1; else Immed = ~Immed + 1ULL; if (Immed & 0xFFFFFFFFFF000000ULL) return std::nullopt; Immed &= 0xFFFFFFULL; return select12BitValueWithLeftShift(Immed); } /// Return true if it is worth folding MI into an extended register. That is, /// if it's safe to pull it into the addressing mode of a load or store as a /// shift. bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( MachineInstr &MI, const MachineRegisterInfo &MRI) const { // Always fold if there is one use, or if we're optimizing for size. Register DefReg = MI.getOperand(0).getReg(); if (MRI.hasOneNonDBGUse(DefReg) || MI.getParent()->getParent()->getFunction().hasOptSize()) return true; // It's better to avoid folding and recomputing shifts when we don't have a // fastpath. if (!STI.hasLSLFast()) return false; // We have a fastpath, so folding a shift in and potentially computing it // many times may be beneficial. Check if this is only used in memory ops. // If it is, then we should fold. 
return all_of(MRI.use_nodbg_instructions(DefReg), [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); } static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { switch (Type) { case AArch64_AM::SXTB: case AArch64_AM::SXTH: case AArch64_AM::SXTW: return true; default: return false; } } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectExtendedSHL( MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, unsigned SizeInBytes, bool WantsExt) const { assert(Base.isReg() && "Expected base to be a register operand"); assert(Offset.isReg() && "Expected offset to be a register operand"); MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); unsigned OffsetOpc = OffsetInst->getOpcode(); bool LookedThroughZExt = false; if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { // Try to look through a ZEXT. if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) return std::nullopt; OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); OffsetOpc = OffsetInst->getOpcode(); LookedThroughZExt = true; if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) return std::nullopt; } // Make sure that the memory op is a valid size. int64_t LegalShiftVal = Log2_32(SizeInBytes); if (LegalShiftVal == 0) return std::nullopt; if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) return std::nullopt; // Now, try to find the specific G_CONSTANT. Start by assuming that the // register we will offset is the LHS, and the register containing the // constant is the RHS. Register OffsetReg = OffsetInst->getOperand(1).getReg(); Register ConstantReg = OffsetInst->getOperand(2).getReg(); auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); if (!ValAndVReg) { // We didn't get a constant on the RHS. If the opcode is a shift, then // we're done. 
if (OffsetOpc == TargetOpcode::G_SHL) return std::nullopt; // If we have a G_MUL, we can use either register. Try looking at the RHS. std::swap(OffsetReg, ConstantReg); ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); if (!ValAndVReg) return std::nullopt; } // The value must fit into 3 bits, and must be positive. Make sure that is // true. int64_t ImmVal = ValAndVReg->Value.getSExtValue(); // Since we're going to pull this into a shift, the constant value must be // a power of 2. If we got a multiply, then we need to check this. if (OffsetOpc == TargetOpcode::G_MUL) { if (!llvm::has_single_bit(ImmVal)) return std::nullopt; // Got a power of 2. So, the amount we'll shift is the log base-2 of that. ImmVal = Log2_32(ImmVal); } if ((ImmVal & 0x7) != ImmVal) return std::nullopt; // We are only allowed to shift by LegalShiftVal. This shift value is built // into the instruction, so we can't just use whatever we want. if (ImmVal != LegalShiftVal) return std::nullopt; unsigned SignExtend = 0; if (WantsExt) { // Check if the offset is defined by an extend, unless we looked through a // G_ZEXT earlier. if (!LookedThroughZExt) { MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); if (Ext == AArch64_AM::InvalidShiftExtend) return std::nullopt; SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; // We only support SXTW for signed extension here. if (SignExtend && Ext != AArch64_AM::SXTW) return std::nullopt; OffsetReg = ExtInst->getOperand(1).getReg(); } // Need a 32-bit wide register here. MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); } // We can use the LHS of the GEP as the base, and the LHS of the shift as an // offset. Signify that we are shifting by setting the shift flag to 1. 
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, [=](MachineInstrBuilder &MIB) { // Need to add both immediates here to make sure that they are both // added to the instruction. MIB.addImm(SignExtend); MIB.addImm(1); }}}; } /// This is used for computing addresses like this: /// /// ldr x1, [x2, x3, lsl #3] /// /// Where x2 is the base register, and x3 is an offset register. The shift-left /// is a constant value specific to this load instruction. That is, we'll never /// see anything other than a 3 here (which corresponds to the size of the /// element being loaded.) InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( MachineOperand &Root, unsigned SizeInBytes) const { if (!Root.isReg()) return std::nullopt; MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); // We want to find something like this: // // val = G_CONSTANT LegalShiftVal // shift = G_SHL off_reg val // ptr = G_PTR_ADD base_reg shift // x = G_LOAD ptr // // And fold it into this addressing mode: // // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] // Check if we can find the G_PTR_ADD. MachineInstr *PtrAdd = getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) return std::nullopt; // Now, try to match an opcode which will match our specific offset. // We want a G_SHL or a G_MUL. MachineInstr *OffsetInst = getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); return selectExtendedSHL(Root, PtrAdd->getOperand(1), OffsetInst->getOperand(0), SizeInBytes, /*WantsExt=*/false); } /// This is used for computing addresses like this: /// /// ldr x1, [x2, x3] /// /// Where x2 is the base register, and x3 is an offset register. /// /// When possible (or profitable) to fold a G_PTR_ADD into the address /// calculation, this will do so. Otherwise, it will return std::nullopt. 
InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeRegisterOffset( MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); // We need a GEP. MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD) return std::nullopt; // If this is used more than once, let's not bother folding. // TODO: Check if they are memory ops. If they are, then we can still fold // without having to recompute anything. if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) return std::nullopt; // Base is the GEP's LHS, offset is its RHS. return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Gep->getOperand(1).getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addUse(Gep->getOperand(2).getReg()); }, [=](MachineInstrBuilder &MIB) { // Need to add both immediates here to make sure that they are both // added to the instruction. MIB.addImm(0); MIB.addImm(0); }}}; } /// This is intended to be equivalent to selectAddrModeXRO in /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, unsigned SizeInBytes) const { MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); if (!Root.isReg()) return std::nullopt; MachineInstr *PtrAdd = getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); if (!PtrAdd) return std::nullopt; // Check for an immediates which cannot be encoded in the [base + imm] // addressing mode, and can't be encoded in an add/sub. 
If this happens, we'll // end up with code like: // // mov x0, wide // add x1 base, x0 // ldr x2, [x1, x0] // // In this situation, we can use the [base, xreg] addressing mode to save an // add/sub: // // mov x0, wide // ldr x2, [base, x0] auto ValAndVReg = getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); if (ValAndVReg) { unsigned Scale = Log2_32(SizeInBytes); int64_t ImmOff = ValAndVReg->Value.getSExtValue(); // Skip immediates that can be selected in the load/store addresing // mode. if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) return std::nullopt; // Helper lambda to decide whether or not it is preferable to emit an add. auto isPreferredADD = [](int64_t ImmOff) { // Constants in [0x0, 0xfff] can be encoded in an add. if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) return true; // Can it be encoded in an add lsl #12? if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) return false; // It can be encoded in an add lsl #12, but we may not want to. If it is // possible to select this as a single movz, then prefer that. A single // movz is faster than an add with a shift. return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; }; // If the immediate can be encoded in a single add/sub, then bail out. if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) return std::nullopt; } // Try to fold shifts into the addressing mode. auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); if (AddrModeFns) return AddrModeFns; // If that doesn't work, see if it's possible to fold in registers from // a GEP. return selectAddrModeRegisterOffset(Root); } /// This is used for computing addresses like this: /// /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] /// /// Where we have a 64-bit base register, a 32-bit offset register, and an /// extend (which may or may not be signed). 
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
    return std::nullopt;

  MachineOperand &LHS = PtrAdd->getOperand(1);
  MachineOperand &RHS = PtrAdd->getOperand(2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
  //
  // e.g.
  // ldr something, [base_reg, ext_reg, sxtw]
  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
    return std::nullopt;

  // Check if this is an extend. We'll get an extend type if it is.
  AArch64_AM::ShiftExtendType Ext =
      getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
  if (Ext == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;

  // Need a 32-bit wide register.
  MachineIRBuilder MIB(*PtrAdd);
  Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
                                       AArch64::GPR32RegClass, MIB);
  unsigned SignExtend = Ext == AArch64_AM::SXTW;

  // Base is LHS, offset is ExtReg.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             // Sign-extend flag first, then 0 for "no shift".
             MIB.addImm(SignExtend);
             MIB.addImm(0);
           }}};
}

/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
/// memory reference, which is needed here to know what is valid for a scaled
/// immediate.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
                                                   unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  if (!isBaseWithConstantOffset(Root, MRI))
    return std::nullopt;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());

  MachineOperand &OffImm = RootDef->getOperand(2);
  if (!OffImm.isReg())
    return std::nullopt;
  MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
  if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
    return std::nullopt;
  MachineOperand &RHSOp1 = RHS->getOperand(1);
  // Only constants that fit in 64 bits can be read with getSExtValue().
  if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
    return std::nullopt;
  int64_t RHSC = RHSOp1.getCImm()->getSExtValue();

  // If the offset is valid as a scaled immediate, don't match here.
  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
      RHSC < (0x1000 << Log2_32(Size)))
    return std::nullopt;
  // Signed 9-bit range.
  if (RHSC >= -256 && RHSC < 256) {
    MachineOperand &Base = RootDef->getOperand(1);
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
    }};
  }
  return std::nullopt;
}

/// If \p RootDef is a G_ADD_LOW whose first source is defined by an ADRP,
/// try to fold the low part of the address into the immediate operand of a
/// memory access of \p Size bytes.
///
/// On success the returned renderers add the ADRP result register followed by
/// the global address tagged MO_PAGEOFF | MO_NC. Returns std::nullopt when
/// the pattern does not match, the offset is not a multiple of \p Size, the
/// global is thread-local, or its pointer alignment is smaller than \p Size.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
                                                 unsigned Size,
                                                 MachineRegisterInfo &MRI) const {
  if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
    return std::nullopt;
  MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
  if (Adrp.getOpcode() != AArch64::ADRP)
    return std::nullopt;

  // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
  auto Offset = Adrp.getOperand(1).getOffset();
  if (Offset % Size != 0)
    return std::nullopt;

  auto GV = Adrp.getOperand(1).getGlobal();
  if (GV->isThreadLocal())
    return std::nullopt;

  auto &MF = *RootDef.getParent()->getParent();
  if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
    return std::nullopt;

  unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
  Register AdrpReg = Adrp.getOperand(0).getReg();
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addGlobalAddress(GV, Offset,
                                  OpFlags | AArch64II::MO_PAGEOFF |
                                      AArch64II::MO_NC);
           }}};
}

/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineFunction &Fn = *Root.getParent()->getParent()->getParent();
  MachineRegisterInfo &RegInfo = Fn.getRegInfo();
  if (!Root.isReg())
    return std::nullopt;

  // Renderer shared by every "no offset" result below.
  auto RenderZeroOffset = [](MachineInstrBuilder &MIB) { MIB.addImm(0); };

  // A bare frame index is used directly as the base, with offset 0.
  MachineInstr *AddrDef = RegInfo.getVRegDef(Root.getReg());
  if (AddrDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(AddrDef->getOperand(1)); },
        RenderZeroOffset,
    }};

  // Under the small code model, try folding the ADD of an ADRP + ADD address.
  const CodeModel::Model Model = Fn.getTarget().getCodeModel();
  if (Model == CodeModel::Small)
    if (auto FoldedFns = tryFoldAddLowIntoImm(*AddrDef, Size, RegInfo))
      return FoldedFns;

  if (isBaseWithConstantOffset(Root, RegInfo)) {
    MachineOperand &BaseOp = AddrDef->getOperand(1);
    MachineOperand &OffsetOp = AddrDef->getOperand(2);
    MachineInstr *BaseDef = RegInfo.getVRegDef(BaseOp.getReg());
    MachineInstr *OffsetDef = RegInfo.getVRegDef(OffsetOp.getReg());

    const int64_t Offset = static_cast<int64_t>(
        OffsetDef->getOperand(1).getCImm()->getZExtValue());
    const unsigned Scale = Log2_32(Size);

    // The offset must be a multiple of the access size and, once scaled, fit
    // in an unsigned 12-bit field.
    const bool IsAligned = (Offset & (Size - 1)) == 0;
    const bool InRange = Offset >= 0 && Offset < (0x1000 << Scale);
    if (IsAligned && InRange) {
      auto RenderScaledOffset = [=](MachineInstrBuilder &MIB) {
        MIB.addImm(Offset >> Scale);
      };

      // Prefer the frame index itself as base when the LHS is one.
      if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(BaseDef->getOperand(1)); },
            RenderScaledOffset,
        }};

      return {{
          [=](MachineInstrBuilder &MIB) { MIB.add(BaseOp); },
          RenderScaledOffset,
      }};
    }
  }

  // If the unscaled instructions can handle this address, decline so that
  // they are used instead of the general fallback.
  if (selectAddrModeUnscaled(Root, Size))
    return std::nullopt;

  // General case: the root register as base with a zero offset.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      RenderZeroOffset,
  }};
}

/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { switch (MI.getOpcode()) { default: return AArch64_AM::InvalidShiftExtend; case TargetOpcode::G_SHL: return AArch64_AM::LSL; case TargetOpcode::G_LSHR: return AArch64_AM::LSR; case TargetOpcode::G_ASHR: return AArch64_AM::ASR; case TargetOpcode::G_ROTR: return AArch64_AM::ROR; } } /// Select a "shifted register" operand. If the value is not shifted, set the /// shift operand to a default value of "lsl 0". InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, bool AllowROR) const { if (!Root.isReg()) return std::nullopt; MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); // Check if the operand is defined by an instruction which corresponds to // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); if (ShType == AArch64_AM::InvalidShiftExtend) return std::nullopt; if (ShType == AArch64_AM::ROR && !AllowROR) return std::nullopt; if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) return std::nullopt; // Need an immediate on the RHS. MachineOperand &ShiftRHS = ShiftInst->getOperand(2); auto Immed = getImmedFromMO(ShiftRHS); if (!Immed) return std::nullopt; // We have something that we can fold. Fold in the shift's LHS and RHS into // the instruction. 
MachineOperand &ShiftLHS = ShiftInst->getOperand(1); Register ShiftReg = ShiftLHS.getReg(); unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); unsigned Val = *Immed & (NumBits - 1); unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; } AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { unsigned Opc = MI.getOpcode(); // Handle explicit extend instructions first. if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { unsigned Size; if (Opc == TargetOpcode::G_SEXT) Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); else Size = MI.getOperand(2).getImm(); assert(Size != 64 && "Extend from 64 bits?"); switch (Size) { case 8: return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB; case 16: return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH; case 32: return AArch64_AM::SXTW; default: return AArch64_AM::InvalidShiftExtend; } } if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); assert(Size != 64 && "Extend from 64 bits?"); switch (Size) { case 8: return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB; case 16: return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH; case 32: return AArch64_AM::UXTW; default: return AArch64_AM::InvalidShiftExtend; } } // Don't have an explicit extend. Try to handle a G_AND with a constant mask // on the RHS. if (Opc != TargetOpcode::G_AND) return AArch64_AM::InvalidShiftExtend; std::optional MaybeAndMask = getImmedFromMO(MI.getOperand(2)); if (!MaybeAndMask) return AArch64_AM::InvalidShiftExtend; uint64_t AndMask = *MaybeAndMask; switch (AndMask) { default: return AArch64_AM::InvalidShiftExtend; case 0xFF: return !IsLoadStore ? 
AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; case 0xFFFF: return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; case 0xFFFFFFFF: return AArch64_AM::UXTW; } } Register AArch64InstructionSelector::moveScalarRegClass( Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const { MachineRegisterInfo &MRI = *MIB.getMRI(); auto Ty = MRI.getType(Reg); assert(!Ty.isVector() && "Expected scalars only!"); if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC)) return Reg; // Create a copy and immediately select it. // FIXME: We should have an emitCopy function? auto Copy = MIB.buildCopy({&RC}, {Reg}); selectCopy(*Copy, TII, MRI, TRI, RBI); return Copy.getReg(0); } /// Select an "extended register" operand. This operand folds in an extend /// followed by an optional left shift. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectArithExtendedRegister( MachineOperand &Root) const { if (!Root.isReg()) return std::nullopt; MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); uint64_t ShiftVal = 0; Register ExtReg; AArch64_AM::ShiftExtendType Ext; MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); if (!RootDef) return std::nullopt; if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) return std::nullopt; // Check if we can fold a shift and an extend. if (RootDef->getOpcode() == TargetOpcode::G_SHL) { // Look for a constant on the RHS of the shift. MachineOperand &RHS = RootDef->getOperand(2); std::optional MaybeShiftVal = getImmedFromMO(RHS); if (!MaybeShiftVal) return std::nullopt; ShiftVal = *MaybeShiftVal; if (ShiftVal > 4) return std::nullopt; // Look for a valid extend instruction on the LHS of the shift. 
MachineOperand &LHS = RootDef->getOperand(1); MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); if (!ExtDef) return std::nullopt; Ext = getExtendTypeForInst(*ExtDef, MRI); if (Ext == AArch64_AM::InvalidShiftExtend) return std::nullopt; ExtReg = ExtDef->getOperand(1).getReg(); } else { // Didn't get a shift. Try just folding an extend. Ext = getExtendTypeForInst(*RootDef, MRI); if (Ext == AArch64_AM::InvalidShiftExtend) return std::nullopt; ExtReg = RootDef->getOperand(1).getReg(); // If we have a 32 bit instruction which zeroes out the high half of a // register, we get an implicit zero extend for free. Check if we have one. // FIXME: We actually emit the extend right now even though we don't have // to. if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); if (isDef32(*ExtInst)) return std::nullopt; } } // We require a GPR32 here. Narrow the ExtReg if needed using a subregister // copy. MachineIRBuilder MIB(*RootDef); ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB); return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(getArithExtendImm(Ext, ShiftVal)); }}}; } InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const { if (!Root.isReg()) return std::nullopt; MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); MachineInstr *Extract = getDefIgnoringCopies(Root.getReg(), MRI); if (Extract && Extract->getOpcode() == TargetOpcode::G_UNMERGE_VALUES && Root.getReg() == Extract->getOperand(1).getReg()) { Register ExtReg = Extract->getOperand(2).getReg(); return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}}; } return std::nullopt; } void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); std::optional CstVal = getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI); assert(CstVal && "Expected constant value"); MIB.addImm(*CstVal); } void AArch64InstructionSelector::renderLogicalImm32( MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); MIB.addImm(Enc); } void AArch64InstructionSelector::renderLogicalImm64( MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); MIB.addImm(Enc); } void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && "Expected G_FCONSTANT"); MIB.addImm( AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF())); } void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && "Expected G_FCONSTANT"); MIB.addImm( AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF())); } void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && "Expected G_FCONSTANT"); MIB.addImm( AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF())); } void AArch64InstructionSelector::renderFPImm32SIMDModImmType4( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(MI.getOpcode() == 
TargetOpcode::G_FCONSTANT && OpIdx == -1 && "Expected G_FCONSTANT"); MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1) .getFPImm() ->getValueAPF() .bitcastToAPInt() .getZExtValue())); } bool AArch64InstructionSelector::isLoadStoreOfNumBytes( const MachineInstr &MI, unsigned NumBytes) const { if (!MI.mayLoadOrStore()) return false; assert(MI.hasOneMemOperand() && "Expected load/store to have only one mem op!"); return (*MI.memoperands_begin())->getSize() == NumBytes; } bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) return false; // Only return true if we know the operation will zero-out the high half of // the 64-bit register. Truncates can be subregister copies, which don't // zero out the high bits. Copies and other copy-like instructions can be // fed by truncates, or could be lowered as subregister copies. switch (MI.getOpcode()) { default: return true; case TargetOpcode::COPY: case TargetOpcode::G_BITCAST: case TargetOpcode::G_TRUNC: case TargetOpcode::G_PHI: return false; } } // Perform fixups on the given PHI instruction's operands to force them all // to be the same as the destination regbank. static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, const AArch64RegisterBankInfo &RBI) { assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI"); Register DstReg = MI.getOperand(0).getReg(); const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg); assert(DstRB && "Expected PHI dst to have regbank assigned"); MachineIRBuilder MIB(MI); // Go through each operand and ensure it has the same regbank. for (MachineOperand &MO : llvm::drop_begin(MI.operands())) { if (!MO.isReg()) continue; Register OpReg = MO.getReg(); const RegisterBank *RB = MRI.getRegBankOrNull(OpReg); if (RB != DstRB) { // Insert a cross-bank copy. 
auto *OpDef = MRI.getVRegDef(OpReg); const LLT &Ty = MRI.getType(OpReg); MachineBasicBlock &OpDefBB = *OpDef->getParent(); // Any instruction we insert must appear after all PHIs in the block // for the block to be valid MIR. MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator()); if (InsertPt != OpDefBB.end() && InsertPt->isPHI()) InsertPt = OpDefBB.getFirstNonPHI(); MIB.setInsertPt(*OpDef->getParent(), InsertPt); auto Copy = MIB.buildCopy(Ty, OpReg); MRI.setRegBank(Copy.getReg(0), *DstRB); MO.setReg(Copy.getReg(0)); } } } void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { // We're looking for PHIs, build a list so we don't invalidate iterators. MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector Phis; for (auto &BB : MF) { for (auto &MI : BB) { if (MI.getOpcode() == TargetOpcode::G_PHI) Phis.emplace_back(&MI); } } for (auto *MI : Phis) { // We need to do some work here if the operand types are < 16 bit and they // are split across fpr/gpr banks. Since all types <32b on gpr // end up being assigned gpr32 regclasses, we can end up with PHIs here // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't // be selecting heterogenous regbanks for operands if possible, but we // still need to be able to deal with it here. // // To fix this, if we have a gpr-bank operand < 32b in size and at least // one other operand is on the fpr bank, then we add cross-bank copies // to homogenize the operand banks. For simplicity the bank that we choose // to settle on is whatever bank the def operand has. For example: // // %endbb: // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2 // => // %bb2: // ... // %in2_copy:gpr(s16) = COPY %in2:fpr(s16) // ... 
// %endbb: // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 bool HasGPROp = false, HasFPROp = false; for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) { if (!MO.isReg()) continue; const LLT &Ty = MRI.getType(MO.getReg()); if (!Ty.isValid() || !Ty.isScalar()) break; if (Ty.getSizeInBits() >= 32) break; const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); // If for some reason we don't have a regbank yet. Don't try anything. if (!RB) break; if (RB->getID() == AArch64::GPRRegBankID) HasGPROp = true; else HasFPROp = true; } // We have heterogenous regbanks, need to fixup. if (HasGPROp && HasFPROp) fixupPHIOpBanks(*MI, MRI, RBI); } } namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, AArch64Subtarget &Subtarget, AArch64RegisterBankInfo &RBI) { return new AArch64InstructionSelector(TM, Subtarget, RBI); } }