//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

static cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
    cl::desc("Maximum interleave factor for MVE VLDn to generate."),
    cl::init(2));

// The APCS parameter registers.
152 static const MCPhysReg GPRArgRegs[] = { 153 ARM::R0, ARM::R1, ARM::R2, ARM::R3 154 }; 155 156 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, 157 MVT PromotedBitwiseVT) { 158 if (VT != PromotedLdStVT) { 159 setOperationAction(ISD::LOAD, VT, Promote); 160 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 161 162 setOperationAction(ISD::STORE, VT, Promote); 163 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 164 } 165 166 MVT ElemTy = VT.getVectorElementType(); 167 if (ElemTy != MVT::f64) 168 setOperationAction(ISD::SETCC, VT, Custom); 169 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 170 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 171 if (ElemTy == MVT::i32) { 172 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 173 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 174 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 175 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 176 } else { 177 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 178 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 179 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 180 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 181 } 182 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 183 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 184 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 185 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 186 setOperationAction(ISD::SELECT, VT, Expand); 187 setOperationAction(ISD::SELECT_CC, VT, Expand); 188 setOperationAction(ISD::VSELECT, VT, Expand); 189 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 190 if (VT.isInteger()) { 191 setOperationAction(ISD::SHL, VT, Custom); 192 setOperationAction(ISD::SRA, VT, Custom); 193 setOperationAction(ISD::SRL, VT, Custom); 194 } 195 196 // Promote all bit-wise operations. 197 if (VT.isInteger() && VT != PromotedBitwiseVT) { 198 setOperationAction(ISD::AND, VT, Promote); 199 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); 200 setOperationAction(ISD::OR, VT, Promote); 201 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); 202 setOperationAction(ISD::XOR, VT, Promote); 203 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); 204 } 205 206 // Neon does not support vector divide/remainder operations. 
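  // Note (illustrative, not tied to any one subtarget): marking these Expand
  // means the legalizer breaks them up instead of selecting a single
  // instruction. For example, a <4 x i32> sdiv is scalarized into four scalar
  // divides, and on AEABI targets without hardware divide each of those may
  // in turn become an __aeabi_idiv call.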
207 setOperationAction(ISD::SDIV, VT, Expand); 208 setOperationAction(ISD::UDIV, VT, Expand); 209 setOperationAction(ISD::FDIV, VT, Expand); 210 setOperationAction(ISD::SREM, VT, Expand); 211 setOperationAction(ISD::UREM, VT, Expand); 212 setOperationAction(ISD::FREM, VT, Expand); 213 214 if (!VT.isFloatingPoint() && 215 VT != MVT::v2i64 && VT != MVT::v1i64) 216 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 217 setOperationAction(Opcode, VT, Legal); 218 if (!VT.isFloatingPoint()) 219 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) 220 setOperationAction(Opcode, VT, Legal); 221 } 222 223 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 224 addRegisterClass(VT, &ARM::DPRRegClass); 225 addTypeForNEON(VT, MVT::f64, MVT::v2i32); 226 } 227 228 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 229 addRegisterClass(VT, &ARM::DPairRegClass); 230 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); 231 } 232 233 void ARMTargetLowering::setAllExpand(MVT VT) { 234 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) 235 setOperationAction(Opc, VT, Expand); 236 237 // We support these really simple operations even on types where all 238 // the actual arithmetic has to be broken down into simpler 239 // operations or turned into library calls. 240 setOperationAction(ISD::BITCAST, VT, Legal); 241 setOperationAction(ISD::LOAD, VT, Legal); 242 setOperationAction(ISD::STORE, VT, Legal); 243 setOperationAction(ISD::UNDEF, VT, Legal); 244 } 245 246 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, 247 LegalizeAction Action) { 248 setLoadExtAction(ISD::EXTLOAD, From, To, Action); 249 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); 250 setLoadExtAction(ISD::SEXTLOAD, From, To, Action); 251 } 252 253 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { 254 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; 255 256 for (auto VT : IntTypes) { 257 addRegisterClass(VT, &ARM::MQPRRegClass); 258 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 259 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 260 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 261 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 262 setOperationAction(ISD::SHL, VT, Custom); 263 setOperationAction(ISD::SRA, VT, Custom); 264 setOperationAction(ISD::SRL, VT, Custom); 265 setOperationAction(ISD::SMIN, VT, Legal); 266 setOperationAction(ISD::SMAX, VT, Legal); 267 setOperationAction(ISD::UMIN, VT, Legal); 268 setOperationAction(ISD::UMAX, VT, Legal); 269 setOperationAction(ISD::ABS, VT, Legal); 270 setOperationAction(ISD::SETCC, VT, Custom); 271 setOperationAction(ISD::MLOAD, VT, Custom); 272 setOperationAction(ISD::MSTORE, VT, Legal); 273 setOperationAction(ISD::CTLZ, VT, Legal); 274 setOperationAction(ISD::CTTZ, VT, Custom); 275 setOperationAction(ISD::BITREVERSE, VT, Legal); 276 setOperationAction(ISD::BSWAP, VT, Legal); 277 setOperationAction(ISD::SADDSAT, VT, Legal); 278 setOperationAction(ISD::UADDSAT, VT, Legal); 279 setOperationAction(ISD::SSUBSAT, VT, Legal); 280 setOperationAction(ISD::USUBSAT, VT, Legal); 281 282 // No native support for these. 
283 setOperationAction(ISD::UDIV, VT, Expand); 284 setOperationAction(ISD::SDIV, VT, Expand); 285 setOperationAction(ISD::UREM, VT, Expand); 286 setOperationAction(ISD::SREM, VT, Expand); 287 setOperationAction(ISD::CTPOP, VT, Expand); 288 289 // Vector reductions 290 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); 291 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); 292 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); 293 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); 294 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); 295 296 if (!HasMVEFP) { 297 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 298 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 299 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 300 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 301 } 302 303 // Pre and Post inc are supported on loads and stores 304 for (unsigned im = (unsigned)ISD::PRE_INC; 305 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 306 setIndexedLoadAction(im, VT, Legal); 307 setIndexedStoreAction(im, VT, Legal); 308 setIndexedMaskedLoadAction(im, VT, Legal); 309 setIndexedMaskedStoreAction(im, VT, Legal); 310 } 311 } 312 313 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; 314 for (auto VT : FloatTypes) { 315 addRegisterClass(VT, &ARM::MQPRRegClass); 316 if (!HasMVEFP) 317 setAllExpand(VT); 318 319 // These are legal or custom whether we have MVE.fp or not 320 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 321 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 322 setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom); 323 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 324 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 325 setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); 326 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); 327 setOperationAction(ISD::SETCC, VT, Custom); 328 setOperationAction(ISD::MLOAD, VT, Custom); 329 setOperationAction(ISD::MSTORE, VT, Legal); 330 331 // Pre and Post inc are supported on loads and stores 332 for (unsigned im = (unsigned)ISD::PRE_INC; 333 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 334 setIndexedLoadAction(im, VT, Legal); 335 setIndexedStoreAction(im, VT, Legal); 336 setIndexedMaskedLoadAction(im, VT, Legal); 337 setIndexedMaskedStoreAction(im, VT, Legal); 338 } 339 340 if (HasMVEFP) { 341 setOperationAction(ISD::FMINNUM, VT, Legal); 342 setOperationAction(ISD::FMAXNUM, VT, Legal); 343 setOperationAction(ISD::FROUND, VT, Legal); 344 345 // No native support for these. 346 setOperationAction(ISD::FDIV, VT, Expand); 347 setOperationAction(ISD::FREM, VT, Expand); 348 setOperationAction(ISD::FSQRT, VT, Expand); 349 setOperationAction(ISD::FSIN, VT, Expand); 350 setOperationAction(ISD::FCOS, VT, Expand); 351 setOperationAction(ISD::FPOW, VT, Expand); 352 setOperationAction(ISD::FLOG, VT, Expand); 353 setOperationAction(ISD::FLOG2, VT, Expand); 354 setOperationAction(ISD::FLOG10, VT, Expand); 355 setOperationAction(ISD::FEXP, VT, Expand); 356 setOperationAction(ISD::FEXP2, VT, Expand); 357 setOperationAction(ISD::FNEARBYINT, VT, Expand); 358 } 359 } 360 361 // We 'support' these types up to bitcast/load/store level, regardless of 362 // MVE integer-only / float support. Only doing FP data processing on the FP 363 // vector types is inhibited at integer-only level. 
364 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; 365 for (auto VT : LongTypes) { 366 addRegisterClass(VT, &ARM::MQPRRegClass); 367 setAllExpand(VT); 368 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 369 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 370 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 371 } 372 // We can do bitwise operations on v2i64 vectors 373 setOperationAction(ISD::AND, MVT::v2i64, Legal); 374 setOperationAction(ISD::OR, MVT::v2i64, Legal); 375 setOperationAction(ISD::XOR, MVT::v2i64, Legal); 376 377 // It is legal to extload from v4i8 to v4i16 or v4i32. 378 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); 379 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); 380 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); 381 382 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16. 383 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); 384 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); 385 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); 386 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal); 387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal); 388 389 // Some truncating stores are legal too. 390 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); 391 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); 392 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 393 394 // Pre and Post inc on these are legal, given the correct extends 395 for (unsigned im = (unsigned)ISD::PRE_INC; 396 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 397 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { 398 setIndexedLoadAction(im, VT, Legal); 399 setIndexedStoreAction(im, VT, Legal); 400 setIndexedMaskedLoadAction(im, VT, Legal); 401 setIndexedMaskedStoreAction(im, VT, Legal); 402 } 403 } 404 405 // Predicate types 406 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; 407 for (auto VT : pTypes) { 408 addRegisterClass(VT, &ARM::VCCRRegClass); 409 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 410 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 411 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 412 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 413 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 414 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 415 setOperationAction(ISD::SETCC, VT, Custom); 416 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); 417 setOperationAction(ISD::LOAD, VT, Custom); 418 setOperationAction(ISD::STORE, VT, Custom); 419 } 420 } 421 422 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 423 const ARMSubtarget &STI) 424 : TargetLowering(TM), Subtarget(&STI) { 425 RegInfo = Subtarget->getRegisterInfo(); 426 Itins = Subtarget->getInstrItineraryData(); 427 428 setBooleanContents(ZeroOrOneBooleanContent); 429 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 430 431 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 432 !Subtarget->isTargetWatchOS()) { 433 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 434 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 435 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 436 IsHFTarget ? CallingConv::ARM_AAPCS_VFP 437 : CallingConv::ARM_AAPCS); 438 } 439 440 if (Subtarget->isTargetMachO()) { 441 // Uses VFP for Thumb libfuncs if available. 
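    // Rough sketch of how the table below is consumed: each entry installs a
    // Darwin VFP helper by name, and for the comparison helpers the listed
    // condition is the test applied to the call's integer result (these
    // routines return non-zero when the predicate holds, hence ISD::SETNE).
    // Arithmetic entries carry ISD::SETCC_INVALID because no compare follows.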
442 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 443 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 444 static const struct { 445 const RTLIB::Libcall Op; 446 const char * const Name; 447 const ISD::CondCode Cond; 448 } LibraryCalls[] = { 449 // Single-precision floating-point arithmetic. 450 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 451 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 452 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 453 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 454 455 // Double-precision floating-point arithmetic. 456 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 457 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 458 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 459 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 460 461 // Single-precision comparisons. 462 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 463 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 464 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 465 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 466 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 467 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 468 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 469 470 // Double-precision comparisons. 471 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 472 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 473 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 474 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 475 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 476 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 477 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 478 479 // Floating-point to integer conversions. 480 // i64 conversions are done via library routines even when generating VFP 481 // instructions, so use the same ones. 482 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 483 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 484 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 485 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 486 487 // Conversions between floating types. 488 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 489 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 490 491 // Integer to floating-point conversions. 492 // i64 conversions are done via library routines even when generating VFP 493 // instructions, so use the same ones. 494 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 495 // e.g., __floatunsidf vs. __floatunssidfvfp. 496 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 497 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 498 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 499 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 500 }; 501 502 for (const auto &LC : LibraryCalls) { 503 setLibcallName(LC.Op, LC.Name); 504 if (LC.Cond != ISD::SETCC_INVALID) 505 setCmpLibcallCC(LC.Op, LC.Cond); 506 } 507 } 508 } 509 510 // These libcalls are not available in 32-bit. 
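  // Clearing the names below (nullptr) removes the libcall fallback entirely,
  // so 128-bit shifts are expanded inline by the legalizer rather than
  // emitted as calls.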
511 setLibcallName(RTLIB::SHL_I128, nullptr); 512 setLibcallName(RTLIB::SRL_I128, nullptr); 513 setLibcallName(RTLIB::SRA_I128, nullptr); 514 515 // RTLIB 516 if (Subtarget->isAAPCS_ABI() && 517 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 518 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 519 static const struct { 520 const RTLIB::Libcall Op; 521 const char * const Name; 522 const CallingConv::ID CC; 523 const ISD::CondCode Cond; 524 } LibraryCalls[] = { 525 // Double-precision floating-point arithmetic helper functions 526 // RTABI chapter 4.1.2, Table 2 527 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 528 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 529 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 530 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 531 532 // Double-precision floating-point comparison helper functions 533 // RTABI chapter 4.1.2, Table 3 534 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 535 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 536 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 537 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 538 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 539 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 540 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 541 542 // Single-precision floating-point arithmetic helper functions 543 // RTABI chapter 4.1.2, Table 4 544 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 545 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 546 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 547 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 548 549 // Single-precision floating-point comparison helper functions 550 // RTABI chapter 4.1.2, Table 5 551 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 552 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 553 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 554 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 555 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 556 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 557 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 558 559 // Floating-point to integer conversions. 
560 // RTABI chapter 4.1.2, Table 6 561 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 562 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 563 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 564 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 565 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 566 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 567 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 568 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 569 570 // Conversions between floating types. 571 // RTABI chapter 4.1.2, Table 7 572 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 573 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 574 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 575 576 // Integer to floating-point conversions. 577 // RTABI chapter 4.1.2, Table 8 578 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 579 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 580 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 581 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 582 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 583 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 584 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 585 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 586 587 // Long long helper functions 588 // RTABI chapter 4.2, Table 9 589 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 590 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 591 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 592 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 593 594 // Integer division functions 595 // RTABI chapter 4.3.1 596 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 597 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 598 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 599 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 600 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 601 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 602 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 603 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 604 }; 605 606 for (const auto &LC : LibraryCalls) { 607 setLibcallName(LC.Op, LC.Name); 608 setLibcallCallingConv(LC.Op, LC.CC); 609 if (LC.Cond != ISD::SETCC_INVALID) 610 setCmpLibcallCC(LC.Op, LC.Cond); 611 } 612 613 // EABI dependent RTLIB 614 if (TM.Options.EABIVersion == EABI::EABI4 || 615 TM.Options.EABIVersion == EABI::EABI5) { 616 static const struct { 617 const RTLIB::Libcall Op; 618 const char *const Name; 619 
const CallingConv::ID CC; 620 const ISD::CondCode Cond; 621 } MemOpsLibraryCalls[] = { 622 // Memory operations 623 // RTABI chapter 4.3.4 624 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 627 }; 628 629 for (const auto &LC : MemOpsLibraryCalls) { 630 setLibcallName(LC.Op, LC.Name); 631 setLibcallCallingConv(LC.Op, LC.CC); 632 if (LC.Cond != ISD::SETCC_INVALID) 633 setCmpLibcallCC(LC.Op, LC.Cond); 634 } 635 } 636 } 637 638 if (Subtarget->isTargetWindows()) { 639 static const struct { 640 const RTLIB::Libcall Op; 641 const char * const Name; 642 const CallingConv::ID CC; 643 } LibraryCalls[] = { 644 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 645 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 646 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 647 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 648 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 649 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 650 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 651 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 652 }; 653 654 for (const auto &LC : LibraryCalls) { 655 setLibcallName(LC.Op, LC.Name); 656 setLibcallCallingConv(LC.Op, LC.CC); 657 } 658 } 659 660 // Use divmod compiler-rt calls for iOS 5.0 and later. 661 if (Subtarget->isTargetMachO() && 662 !(Subtarget->isTargetIOS() && 663 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 664 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 665 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 666 } 667 668 // The half <-> float conversion functions are always soft-float on 669 // non-watchos platforms, but are needed for some targets which use a 670 // hard-float calling convention by default. 671 if (!Subtarget->isTargetWatchABI()) { 672 if (Subtarget->isAAPCS_ABI()) { 673 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 674 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 675 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 676 } else { 677 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 678 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 679 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 680 } 681 } 682 683 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 684 // a __gnu_ prefix (which is the default). 
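  // For example (assuming the usual default names in RuntimeLibcalls): a GNU
  // target keeps __gnu_h2f_ieee / __gnu_f2h_ieee, while the AEABI block below
  // renames the same conversions to __aeabi_h2f / __aeabi_f2h and gives them
  // the AAPCS calling convention.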
685 if (Subtarget->isTargetAEABI()) { 686 static const struct { 687 const RTLIB::Libcall Op; 688 const char * const Name; 689 const CallingConv::ID CC; 690 } LibraryCalls[] = { 691 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 692 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 693 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 694 }; 695 696 for (const auto &LC : LibraryCalls) { 697 setLibcallName(LC.Op, LC.Name); 698 setLibcallCallingConv(LC.Op, LC.CC); 699 } 700 } 701 702 if (Subtarget->isThumb1Only()) 703 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 704 else 705 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 706 707 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 708 Subtarget->hasFPRegs()) { 709 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 710 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 711 if (!Subtarget->hasVFP2Base()) 712 setAllExpand(MVT::f32); 713 if (!Subtarget->hasFP64()) 714 setAllExpand(MVT::f64); 715 } 716 717 if (Subtarget->hasFullFP16()) { 718 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 719 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 720 setOperationAction(ISD::BITCAST, MVT::i32, Custom); 721 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 722 723 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 724 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 725 } 726 727 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 728 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 729 setTruncStoreAction(VT, InnerVT, Expand); 730 addAllExtLoads(VT, InnerVT, Expand); 731 } 732 733 setOperationAction(ISD::MULHS, VT, Expand); 734 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 735 setOperationAction(ISD::MULHU, VT, Expand); 736 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 737 738 setOperationAction(ISD::BSWAP, VT, Expand); 739 } 740 741 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 742 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 743 744 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 745 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 746 747 if (Subtarget->hasMVEIntegerOps()) 748 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 749 750 // Combine low-overhead loop intrinsics so that we can lower i1 types. 751 if (Subtarget->hasLOB()) { 752 setTargetDAGCombine(ISD::BRCOND); 753 setTargetDAGCombine(ISD::BR_CC); 754 } 755 756 if (Subtarget->hasNEON()) { 757 addDRTypeForNEON(MVT::v2f32); 758 addDRTypeForNEON(MVT::v8i8); 759 addDRTypeForNEON(MVT::v4i16); 760 addDRTypeForNEON(MVT::v2i32); 761 addDRTypeForNEON(MVT::v1i64); 762 763 addQRTypeForNEON(MVT::v4f32); 764 addQRTypeForNEON(MVT::v2f64); 765 addQRTypeForNEON(MVT::v16i8); 766 addQRTypeForNEON(MVT::v8i16); 767 addQRTypeForNEON(MVT::v4i32); 768 addQRTypeForNEON(MVT::v2i64); 769 770 if (Subtarget->hasFullFP16()) { 771 addQRTypeForNEON(MVT::v8f16); 772 addDRTypeForNEON(MVT::v4f16); 773 } 774 } 775 776 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 777 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 778 // none of Neon, MVE or VFP supports any arithmetic operations on it. 779 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 780 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 781 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 782 // FIXME: Code duplication: FDIV and FREM are expanded always, see 783 // ARMTargetLowering::addTypeForNEON method for details. 
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
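    // Rough illustration of the pattern the custom lowering looks for: a wide
    // multiply whose operands are extensions of a narrower vector, e.g.
    //   %a = sext <8 x i8> %x to <8 x i16>
    //   %b = sext <8 x i8> %y to <8 x i16>
    //   %m = mul <8 x i16> %a, %b
    // which can be selected as a single vmull.s8 instead of two extends plus
    // a full-width vmul.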
851 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 852 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 853 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 854 // Custom handling for some vector types to avoid expensive expansions 855 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 856 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 857 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 858 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 859 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 860 // a destination type that is wider than the source, and nor does 861 // it have a FP_TO_[SU]INT instruction with a narrower destination than 862 // source. 863 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 864 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 865 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 866 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 867 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 868 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 869 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 870 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 871 872 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 873 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 874 875 // NEON does not have single instruction CTPOP for vectors with element 876 // types wider than 8-bits. However, custom lowering can leverage the 877 // v8i8/v16i8 vcnt instruction. 878 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 879 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 880 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 881 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 882 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 883 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 884 885 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 886 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 887 888 // NEON does not have single instruction CTTZ for vectors. 889 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 890 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 891 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 892 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 893 894 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 895 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 896 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 897 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 898 899 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 900 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 901 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 902 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 903 904 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 905 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 906 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 907 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 908 909 // NEON only has FMA instructions as of VFP4. 
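    // On older subtargets the vector FMA nodes are therefore marked Expand
    // below, which (roughly) splits a fused multiply-add back into a separate
    // multiply and add, since there is no vfma/vfms to select.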
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64,
Custom); 986 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 987 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 988 } 989 990 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 991 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 992 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 993 if (Subtarget->hasFullFP16()) { 994 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 995 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 996 } 997 } 998 999 if (!Subtarget->hasFP16()) { 1000 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1001 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1002 } 1003 1004 computeRegisterProperties(Subtarget->getRegisterInfo()); 1005 1006 // ARM does not have floating-point extending loads. 1007 for (MVT VT : MVT::fp_valuetypes()) { 1008 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1009 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1010 } 1011 1012 // ... or truncating stores 1013 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1014 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1015 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1016 1017 // ARM does not have i1 sign extending load. 1018 for (MVT VT : MVT::integer_valuetypes()) 1019 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1020 1021 // ARM supports all 4 flavors of integer indexed load / store. 1022 if (!Subtarget->isThumb1Only()) { 1023 for (unsigned im = (unsigned)ISD::PRE_INC; 1024 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1025 setIndexedLoadAction(im, MVT::i1, Legal); 1026 setIndexedLoadAction(im, MVT::i8, Legal); 1027 setIndexedLoadAction(im, MVT::i16, Legal); 1028 setIndexedLoadAction(im, MVT::i32, Legal); 1029 setIndexedStoreAction(im, MVT::i1, Legal); 1030 setIndexedStoreAction(im, MVT::i8, Legal); 1031 setIndexedStoreAction(im, MVT::i16, Legal); 1032 setIndexedStoreAction(im, MVT::i32, Legal); 1033 } 1034 } else { 1035 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 1036 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1037 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1038 } 1039 1040 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1041 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1042 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1043 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1044 1045 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1046 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1047 if (Subtarget->hasDSP()) { 1048 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1049 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1050 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1051 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1052 } 1053 if (Subtarget->hasBaseDSP()) { 1054 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1055 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1056 } 1057 1058 // i64 operation support. 
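  // Sketch of the effect of the next few lines: a plain 64x64-bit multiply is
  // not a single instruction here, so it is legalized into 32-bit pieces
  // (umull/mla style sequences) or, on AEABI targets, may end up as the
  // __aeabi_lmul helper registered earlier.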
1059 setOperationAction(ISD::MUL, MVT::i64, Expand); 1060 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1061 if (Subtarget->isThumb1Only()) { 1062 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1063 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1064 } 1065 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1066 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1067 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1068 1069 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1070 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1071 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1072 setOperationAction(ISD::SRL, MVT::i64, Custom); 1073 setOperationAction(ISD::SRA, MVT::i64, Custom); 1074 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1075 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1076 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1077 setOperationAction(ISD::STORE, MVT::i64, Custom); 1078 1079 // MVE lowers 64 bit shifts to lsll and lsrl 1080 // assuming that ISD::SRL and SRA of i64 are already marked custom 1081 if (Subtarget->hasMVEIntegerOps()) 1082 setOperationAction(ISD::SHL, MVT::i64, Custom); 1083 1084 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1085 if (Subtarget->isThumb1Only()) { 1086 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1087 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1088 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1089 } 1090 1091 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1092 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1093 1094 // ARM does not have ROTL. 1095 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1096 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1097 setOperationAction(ISD::ROTL, VT, Expand); 1098 setOperationAction(ISD::ROTR, VT, Expand); 1099 } 1100 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1101 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1102 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1103 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1104 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1105 } 1106 1107 // @llvm.readcyclecounter requires the Performance Monitors extension. 1108 // Default to the 0 expansion on unsupported platforms. 1109 // FIXME: Technically there are older ARM CPUs that have 1110 // implementation-specific ways of obtaining this information. 1111 if (Subtarget->hasPerfMon()) 1112 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1113 1114 // Only ARMv6 has BSWAP. 1115 if (!Subtarget->hasV6Ops()) 1116 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1117 1118 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1119 : Subtarget->hasDivideInARMMode(); 1120 if (!hasDivide) { 1121 // These are expanded into libcalls if the cpu doesn't have HW divider. 
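    // For example, on an AEABI target an i32 sdiv then becomes a call to
    // __aeabi_idiv and udiv a call to __aeabi_uidiv (see the RTABI table
    // above).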
1122 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1123 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1124 } 1125 1126 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1127 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1128 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1129 1130 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1131 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1132 } 1133 1134 setOperationAction(ISD::SREM, MVT::i32, Expand); 1135 setOperationAction(ISD::UREM, MVT::i32, Expand); 1136 1137 // Register based DivRem for AEABI (RTABI 4.2) 1138 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1139 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1140 Subtarget->isTargetWindows()) { 1141 setOperationAction(ISD::SREM, MVT::i64, Custom); 1142 setOperationAction(ISD::UREM, MVT::i64, Custom); 1143 HasStandaloneRem = false; 1144 1145 if (Subtarget->isTargetWindows()) { 1146 const struct { 1147 const RTLIB::Libcall Op; 1148 const char * const Name; 1149 const CallingConv::ID CC; 1150 } LibraryCalls[] = { 1151 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1152 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1153 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1154 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1155 1156 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1157 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1158 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1159 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1160 }; 1161 1162 for (const auto &LC : LibraryCalls) { 1163 setLibcallName(LC.Op, LC.Name); 1164 setLibcallCallingConv(LC.Op, LC.CC); 1165 } 1166 } else { 1167 const struct { 1168 const RTLIB::Libcall Op; 1169 const char * const Name; 1170 const CallingConv::ID CC; 1171 } LibraryCalls[] = { 1172 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1173 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1174 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1175 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1176 1177 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1178 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1179 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1180 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1181 }; 1182 1183 for (const auto &LC : LibraryCalls) { 1184 setLibcallName(LC.Op, LC.Name); 1185 setLibcallCallingConv(LC.Op, LC.CC); 1186 } 1187 } 1188 1189 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1190 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1191 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1192 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1193 } else { 1194 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1195 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1196 } 1197 1198 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1199 // MSVCRT doesn't have powi; fall back to pow 1200 setLibcallName(RTLIB::POWI_F32, nullptr); 1201 setLibcallName(RTLIB::POWI_F64, nullptr); 1202 } 1203 1204 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1205 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1206 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1207 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1208 1209 setOperationAction(ISD::TRAP, MVT::Other, 
Legal); 1210 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1211 1212 // Use the default implementation. 1213 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1214 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1215 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1216 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1217 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1218 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1219 1220 if (Subtarget->isTargetWindows()) 1221 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1222 else 1223 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1224 1225 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1226 // the default expansion. 1227 InsertFencesForAtomic = false; 1228 if (Subtarget->hasAnyDataBarrier() && 1229 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1230 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1231 // to ldrex/strex loops already. 1232 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1233 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1234 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1235 1236 // On v8, we have particularly efficient implementations of atomic fences 1237 // if they can be combined with nearby atomic loads and stores. 1238 if (!Subtarget->hasAcquireRelease() || 1239 getTargetMachine().getOptLevel() == 0) { 1240 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1241 InsertFencesForAtomic = true; 1242 } 1243 } else { 1244 // If there's anything we can use as a barrier, go through custom lowering 1245 // for ATOMIC_FENCE. 1246 // If target has DMB in thumb, Fences can be inserted. 1247 if (Subtarget->hasDataBarrier()) 1248 InsertFencesForAtomic = true; 1249 1250 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1251 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1252 1253 // Set them all for expansion, which will force libcalls. 1254 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1255 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1256 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1257 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1258 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1259 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1260 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1261 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1262 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1263 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1264 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1265 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1266 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1267 // Unordered/Monotonic case. 1268 if (!InsertFencesForAtomic) { 1269 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1270 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1271 } 1272 } 1273 1274 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1275 1276 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 
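  // Without SXTB/SXTH, SIGN_EXTEND_INREG is expanded into a shift pair,
  // roughly lsl #24 / asr #24 for i8 and lsl #16 / asr #16 for i16.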
1277 if (!Subtarget->hasV6Ops()) { 1278 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1279 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1280 } 1281 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1282 1283 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1284 !Subtarget->isThumb1Only()) { 1285 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1286 // iff target supports vfp2. 1287 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1288 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1289 } 1290 1291 // We want to custom lower some of our intrinsics. 1292 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1293 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1294 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1295 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1296 if (Subtarget->useSjLjEH()) 1297 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1298 1299 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1300 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1301 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1302 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1303 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1304 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1305 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1306 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1307 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1308 if (Subtarget->hasFullFP16()) { 1309 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1310 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1311 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1312 } 1313 1314 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1315 1316 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1317 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1318 if (Subtarget->hasFullFP16()) 1319 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1320 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1321 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1322 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1323 1324 // We don't support sin/cos/fmod/copysign/pow 1325 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1326 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1327 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1328 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1329 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1330 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1331 setOperationAction(ISD::FREM, MVT::f64, Expand); 1332 setOperationAction(ISD::FREM, MVT::f32, Expand); 1333 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1334 !Subtarget->isThumb1Only()) { 1335 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1336 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1337 } 1338 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1339 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1340 1341 if (!Subtarget->hasVFP4Base()) { 1342 setOperationAction(ISD::FMA, MVT::f64, Expand); 1343 setOperationAction(ISD::FMA, MVT::f32, Expand); 1344 } 1345 1346 // Various VFP goodness 1347 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1348 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 
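    // "Expanded" here means the f64 <-> f16 conversions go via f32 (and the
    // half-conversion libcalls configured earlier) instead of the direct
    // double<->half VCVT forms that only FP-ARMv8 provides.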
1349 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1350 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1351 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1352 } 1353 1354 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1355 if (!Subtarget->hasFP16()) { 1356 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1357 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1358 } 1359 } 1360 1361 // Use __sincos_stret if available. 1362 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1363 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1364 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1365 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1366 } 1367 1368 // FP-ARMv8 implements a lot of rounding-like FP operations. 1369 if (Subtarget->hasFPARMv8Base()) { 1370 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1371 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1372 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1373 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1374 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1375 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1376 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1377 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1378 if (Subtarget->hasNEON()) { 1379 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1380 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1381 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1382 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1383 } 1384 1385 if (Subtarget->hasFP64()) { 1386 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1387 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1388 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1389 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1390 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1391 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1392 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1393 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1394 } 1395 } 1396 1397 // FP16 often need to be promoted to call lib functions 1398 if (Subtarget->hasFullFP16()) { 1399 setOperationAction(ISD::FREM, MVT::f16, Promote); 1400 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1401 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1402 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1403 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1404 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1405 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1406 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1407 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1408 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1409 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1410 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1411 1412 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1413 } 1414 1415 if (Subtarget->hasNEON()) { 1416 // vmin and vmax aren't available in a scalar form, so we use 1417 // a NEON instruction with an undef lane instead. 
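    // Sketch of that trick: a scalar (fminimum f32:a, f32:b) is selected as
    // a NEON VMIN.F32 on d-registers whose other lane is undef, roughly
    //   d0 = (a, undef), d1 = (b, undef); vmin.f32 d2, d0, d1
    // and only lane 0 of the result is used. The f16 forms below work the
    // same way when full fp16 is available.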
1418 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1419 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1420 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1421 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1422 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1423 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1424 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1425 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1426 1427 if (Subtarget->hasFullFP16()) { 1428 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1429 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1430 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1431 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1432 1433 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1434 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1435 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1436 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1437 } 1438 } 1439 1440 // We have target-specific dag combine patterns for the following nodes: 1441 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1442 setTargetDAGCombine(ISD::ADD); 1443 setTargetDAGCombine(ISD::SUB); 1444 setTargetDAGCombine(ISD::MUL); 1445 setTargetDAGCombine(ISD::AND); 1446 setTargetDAGCombine(ISD::OR); 1447 setTargetDAGCombine(ISD::XOR); 1448 1449 if (Subtarget->hasV6Ops()) 1450 setTargetDAGCombine(ISD::SRL); 1451 if (Subtarget->isThumb1Only()) 1452 setTargetDAGCombine(ISD::SHL); 1453 1454 setStackPointerRegisterToSaveRestore(ARM::SP); 1455 1456 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1457 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1458 setSchedulingPreference(Sched::RegPressure); 1459 else 1460 setSchedulingPreference(Sched::Hybrid); 1461 1462 //// temporary - rewrite interface to use type 1463 MaxStoresPerMemset = 8; 1464 MaxStoresPerMemsetOptSize = 4; 1465 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1466 MaxStoresPerMemcpyOptSize = 2; 1467 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1468 MaxStoresPerMemmoveOptSize = 2; 1469 1470 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1471 // are at least 4 bytes aligned. 1472 setMinStackArgumentAlignment(Align(4)); 1473 1474 // Prefer likely predicted branches to selects on out-of-order cores. 1475 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1476 1477 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1478 1479 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1480 1481 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1482 setTargetDAGCombine(ISD::ABS); 1483 } 1484 1485 bool ARMTargetLowering::useSoftFloat() const { 1486 return Subtarget->useSoftFloat(); 1487 } 1488 1489 // FIXME: It might make sense to define the representative register class as the 1490 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1491 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1492 // SPR's representative would be DPR_VFP2. This should work well if register 1493 // pressure tracking were modified such that a register use would increment the 1494 // pressure of the register class's representative and all of it's super 1495 // classes' representatives transitively. 
We have not implemented this because 1496 // of the difficulty prior to coalescing of modeling operand register classes 1497 // due to the common occurrence of cross class copies and subregister insertions 1498 // and extractions. 1499 std::pair<const TargetRegisterClass *, uint8_t> 1500 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1501 MVT VT) const { 1502 const TargetRegisterClass *RRC = nullptr; 1503 uint8_t Cost = 1; 1504 switch (VT.SimpleTy) { 1505 default: 1506 return TargetLowering::findRepresentativeClass(TRI, VT); 1507 // Use DPR as representative register class for all floating point 1508 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1509 // the cost is 1 for both f32 and f64. 1510 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1511 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1512 RRC = &ARM::DPRRegClass; 1513 // When NEON is used for SP, only half of the register file is available 1514 // because operations that define both SP and DP results will be constrained 1515 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1516 // coalescing by double-counting the SP regs. See the FIXME above. 1517 if (Subtarget->useNEONForSinglePrecisionFP()) 1518 Cost = 2; 1519 break; 1520 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1521 case MVT::v4f32: case MVT::v2f64: 1522 RRC = &ARM::DPRRegClass; 1523 Cost = 2; 1524 break; 1525 case MVT::v4i64: 1526 RRC = &ARM::DPRRegClass; 1527 Cost = 4; 1528 break; 1529 case MVT::v8i64: 1530 RRC = &ARM::DPRRegClass; 1531 Cost = 8; 1532 break; 1533 } 1534 return std::make_pair(RRC, Cost); 1535 } 1536 1537 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1538 switch ((ARMISD::NodeType)Opcode) { 1539 case ARMISD::FIRST_NUMBER: break; 1540 case ARMISD::Wrapper: return "ARMISD::Wrapper"; 1541 case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; 1542 case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; 1543 case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; 1544 case ARMISD::CALL: return "ARMISD::CALL"; 1545 case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; 1546 case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; 1547 case ARMISD::BRCOND: return "ARMISD::BRCOND"; 1548 case ARMISD::BR_JT: return "ARMISD::BR_JT"; 1549 case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; 1550 case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; 1551 case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; 1552 case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; 1553 case ARMISD::CMP: return "ARMISD::CMP"; 1554 case ARMISD::CMN: return "ARMISD::CMN"; 1555 case ARMISD::CMPZ: return "ARMISD::CMPZ"; 1556 case ARMISD::CMPFP: return "ARMISD::CMPFP"; 1557 case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; 1558 case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; 1559 case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; 1560 1561 case ARMISD::CMOV: return "ARMISD::CMOV"; 1562 case ARMISD::SUBS: return "ARMISD::SUBS"; 1563 1564 case ARMISD::SSAT: return "ARMISD::SSAT"; 1565 case ARMISD::USAT: return "ARMISD::USAT"; 1566 1567 case ARMISD::ASRL: return "ARMISD::ASRL"; 1568 case ARMISD::LSRL: return "ARMISD::LSRL"; 1569 case ARMISD::LSLL: return "ARMISD::LSLL"; 1570 1571 case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; 1572 case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; 1573 case ARMISD::RRX: return "ARMISD::RRX"; 1574 1575 case ARMISD::ADDC: return "ARMISD::ADDC"; 1576 case ARMISD::ADDE: return "ARMISD::ADDE"; 1577 case ARMISD::SUBC: return 
"ARMISD::SUBC"; 1578 case ARMISD::SUBE: return "ARMISD::SUBE"; 1579 case ARMISD::LSLS: return "ARMISD::LSLS"; 1580 1581 case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; 1582 case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; 1583 case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; 1584 case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; 1585 case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; 1586 1587 case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; 1588 case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; 1589 case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; 1590 1591 case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; 1592 1593 case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; 1594 1595 case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; 1596 1597 case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; 1598 1599 case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; 1600 1601 case ARMISD::LDRD: return "ARMISD::LDRD"; 1602 case ARMISD::STRD: return "ARMISD::STRD"; 1603 1604 case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; 1605 case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; 1606 1607 case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; 1608 case ARMISD::VCMP: return "ARMISD::VCMP"; 1609 case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; 1610 case ARMISD::VTST: return "ARMISD::VTST"; 1611 1612 case ARMISD::VSHLs: return "ARMISD::VSHLs"; 1613 case ARMISD::VSHLu: return "ARMISD::VSHLu"; 1614 case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; 1615 case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; 1616 case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; 1617 case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; 1618 case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; 1619 case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; 1620 case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; 1621 case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; 1622 case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; 1623 case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; 1624 case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; 1625 case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; 1626 case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; 1627 case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; 1628 case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; 1629 case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; 1630 case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; 1631 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1632 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1633 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1634 case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; 1635 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1636 case ARMISD::VDUP: return "ARMISD::VDUP"; 1637 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1638 case ARMISD::VEXT: return "ARMISD::VEXT"; 1639 case ARMISD::VREV64: return "ARMISD::VREV64"; 1640 case ARMISD::VREV32: return "ARMISD::VREV32"; 1641 case ARMISD::VREV16: return "ARMISD::VREV16"; 1642 case ARMISD::VZIP: return "ARMISD::VZIP"; 1643 case ARMISD::VUZP: return "ARMISD::VUZP"; 1644 case ARMISD::VTRN: return "ARMISD::VTRN"; 1645 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1646 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1647 case ARMISD::VMOVN: return "ARMISD::VMOVN"; 1648 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1649 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1650 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1651 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1652 case ARMISD::SMLAL: return 
"ARMISD::SMLAL"; 1653 case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; 1654 case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; 1655 case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; 1656 case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; 1657 case ARMISD::SMULWB: return "ARMISD::SMULWB"; 1658 case ARMISD::SMULWT: return "ARMISD::SMULWT"; 1659 case ARMISD::SMLALD: return "ARMISD::SMLALD"; 1660 case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; 1661 case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; 1662 case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; 1663 case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; 1664 case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; 1665 case ARMISD::QADD16b: return "ARMISD::QADD16b"; 1666 case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; 1667 case ARMISD::QADD8b: return "ARMISD::QADD8b"; 1668 case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; 1669 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1670 case ARMISD::BFI: return "ARMISD::BFI"; 1671 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1672 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1673 case ARMISD::VBSL: return "ARMISD::VBSL"; 1674 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1675 case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; 1676 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1677 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1678 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1679 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1680 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1681 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1682 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1683 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1684 case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 1685 case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; 1686 case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; 1687 case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; 1688 case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; 1689 case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; 1690 case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; 1691 case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; 1692 case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; 1693 case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; 1694 case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; 1695 case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; 1696 case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; 1697 case ARMISD::WLS: return "ARMISD::WLS"; 1698 case ARMISD::LE: return "ARMISD::LE"; 1699 case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; 1700 case ARMISD::CSINV: return "ARMISD::CSINV"; 1701 case ARMISD::CSNEG: return "ARMISD::CSNEG"; 1702 case ARMISD::CSINC: return "ARMISD::CSINC"; 1703 } 1704 return nullptr; 1705 } 1706 1707 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1708 EVT VT) const { 1709 if (!VT.isVector()) 1710 return getPointerTy(DL); 1711 1712 // MVE has a predicate register. 1713 if (Subtarget->hasMVEIntegerOps() && 1714 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) 1715 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1716 return VT.changeVectorElementTypeToInteger(); 1717 } 1718 1719 /// getRegClassFor - Return the register class that should be used for the 1720 /// specified value type. 1721 const TargetRegisterClass * 1722 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1723 (void)isDivergent; 1724 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1725 // v8i64 to QQQQ registers. 
v4i64 and v8i64 are only used for REG_SEQUENCE to 1726 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1727 // MVE Q registers. 1728 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1729 if (VT == MVT::v4i64) 1730 return &ARM::QQPRRegClass; 1731 if (VT == MVT::v8i64) 1732 return &ARM::QQQQPRRegClass; 1733 } 1734 return TargetLowering::getRegClassFor(VT); 1735 } 1736 1737 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1738 // source/dest is aligned and the copy size is large enough. We therefore want 1739 // to align such objects passed to memory intrinsics. 1740 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1741 unsigned &PrefAlign) const { 1742 if (!isa<MemIntrinsic>(CI)) 1743 return false; 1744 MinSize = 8; 1745 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1746 // cycle faster than 4-byte aligned LDM. 1747 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1748 return true; 1749 } 1750 1751 // Create a fast isel object. 1752 FastISel * 1753 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1754 const TargetLibraryInfo *libInfo) const { 1755 return ARM::createFastISel(funcInfo, libInfo); 1756 } 1757 1758 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1759 unsigned NumVals = N->getNumValues(); 1760 if (!NumVals) 1761 return Sched::RegPressure; 1762 1763 for (unsigned i = 0; i != NumVals; ++i) { 1764 EVT VT = N->getValueType(i); 1765 if (VT == MVT::Glue || VT == MVT::Other) 1766 continue; 1767 if (VT.isFloatingPoint() || VT.isVector()) 1768 return Sched::ILP; 1769 } 1770 1771 if (!N->isMachineOpcode()) 1772 return Sched::RegPressure; 1773 1774 // Load are scheduled for latency even if there instruction itinerary 1775 // is not available. 1776 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1777 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1778 1779 if (MCID.getNumDefs() == 0) 1780 return Sched::RegPressure; 1781 if (!Itins->isEmpty() && 1782 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1783 return Sched::ILP; 1784 1785 return Sched::RegPressure; 1786 } 1787 1788 //===----------------------------------------------------------------------===// 1789 // Lowering Code 1790 //===----------------------------------------------------------------------===// 1791 1792 static bool isSRL16(const SDValue &Op) { 1793 if (Op.getOpcode() != ISD::SRL) 1794 return false; 1795 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1796 return Const->getZExtValue() == 16; 1797 return false; 1798 } 1799 1800 static bool isSRA16(const SDValue &Op) { 1801 if (Op.getOpcode() != ISD::SRA) 1802 return false; 1803 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1804 return Const->getZExtValue() == 16; 1805 return false; 1806 } 1807 1808 static bool isSHL16(const SDValue &Op) { 1809 if (Op.getOpcode() != ISD::SHL) 1810 return false; 1811 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1812 return Const->getZExtValue() == 16; 1813 return false; 1814 } 1815 1816 // Check for a signed 16-bit value. We special case SRA because it makes it 1817 // more simple when also looking for SRAs that aren't sign extending a 1818 // smaller value. Without the check, we'd need to take extra care with 1819 // checking order for some operations. 
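// For example, (sra (shl x, 16), 16) is accepted through the explicit
// SRA/SHL pattern check, and any other value the DAG can prove has exactly
// 17 sign bits (i.e. a signed 16-bit quantity in an i32) is accepted via
// ComputeNumSignBits.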
1820 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1821 if (isSRA16(Op)) 1822 return isSHL16(Op.getOperand(0)); 1823 return DAG.ComputeNumSignBits(Op) == 17; 1824 } 1825 1826 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1827 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1828 switch (CC) { 1829 default: llvm_unreachable("Unknown condition code!"); 1830 case ISD::SETNE: return ARMCC::NE; 1831 case ISD::SETEQ: return ARMCC::EQ; 1832 case ISD::SETGT: return ARMCC::GT; 1833 case ISD::SETGE: return ARMCC::GE; 1834 case ISD::SETLT: return ARMCC::LT; 1835 case ISD::SETLE: return ARMCC::LE; 1836 case ISD::SETUGT: return ARMCC::HI; 1837 case ISD::SETUGE: return ARMCC::HS; 1838 case ISD::SETULT: return ARMCC::LO; 1839 case ISD::SETULE: return ARMCC::LS; 1840 } 1841 } 1842 1843 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 1844 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1845 ARMCC::CondCodes &CondCode2) { 1846 CondCode2 = ARMCC::AL; 1847 switch (CC) { 1848 default: llvm_unreachable("Unknown FP condition!"); 1849 case ISD::SETEQ: 1850 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1851 case ISD::SETGT: 1852 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1853 case ISD::SETGE: 1854 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1855 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1856 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1857 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1858 case ISD::SETO: CondCode = ARMCC::VC; break; 1859 case ISD::SETUO: CondCode = ARMCC::VS; break; 1860 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1861 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1862 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1863 case ISD::SETLT: 1864 case ISD::SETULT: CondCode = ARMCC::LT; break; 1865 case ISD::SETLE: 1866 case ISD::SETULE: CondCode = ARMCC::LE; break; 1867 case ISD::SETNE: 1868 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1869 } 1870 } 1871 1872 //===----------------------------------------------------------------------===// 1873 // Calling Convention Implementation 1874 //===----------------------------------------------------------------------===// 1875 1876 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1877 /// account presence of floating point hardware and calling convention 1878 /// limitations, such as support for variadic functions. 1879 CallingConv::ID 1880 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1881 bool isVarArg) const { 1882 switch (CC) { 1883 default: 1884 report_fatal_error("Unsupported calling convention"); 1885 case CallingConv::ARM_AAPCS: 1886 case CallingConv::ARM_APCS: 1887 case CallingConv::GHC: 1888 case CallingConv::CFGuard_Check: 1889 return CC; 1890 case CallingConv::PreserveMost: 1891 return CallingConv::PreserveMost; 1892 case CallingConv::ARM_AAPCS_VFP: 1893 case CallingConv::Swift: 1894 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1895 case CallingConv::C: 1896 if (!Subtarget->isAAPCS_ABI()) 1897 return CallingConv::ARM_APCS; 1898 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 1899 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1900 !isVarArg) 1901 return CallingConv::ARM_AAPCS_VFP; 1902 else 1903 return CallingConv::ARM_AAPCS; 1904 case CallingConv::Fast: 1905 case CallingConv::CXX_FAST_TLS: 1906 if (!Subtarget->isAAPCS_ABI()) { 1907 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 1908 return CallingConv::Fast; 1909 return CallingConv::ARM_APCS; 1910 } else if (Subtarget->hasVFP2Base() && 1911 !Subtarget->isThumb1Only() && !isVarArg) 1912 return CallingConv::ARM_AAPCS_VFP; 1913 else 1914 return CallingConv::ARM_AAPCS; 1915 } 1916 } 1917 1918 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1919 bool isVarArg) const { 1920 return CCAssignFnForNode(CC, false, isVarArg); 1921 } 1922 1923 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1924 bool isVarArg) const { 1925 return CCAssignFnForNode(CC, true, isVarArg); 1926 } 1927 1928 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1929 /// CallingConvention. 1930 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1931 bool Return, 1932 bool isVarArg) const { 1933 switch (getEffectiveCallingConv(CC, isVarArg)) { 1934 default: 1935 report_fatal_error("Unsupported calling convention"); 1936 case CallingConv::ARM_APCS: 1937 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1938 case CallingConv::ARM_AAPCS: 1939 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1940 case CallingConv::ARM_AAPCS_VFP: 1941 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1942 case CallingConv::Fast: 1943 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1944 case CallingConv::GHC: 1945 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1946 case CallingConv::PreserveMost: 1947 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1948 case CallingConv::CFGuard_Check: 1949 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 1950 } 1951 } 1952 1953 /// LowerCallResult - Lower the result values of a call into the 1954 /// appropriate copies out of appropriate physical registers. 1955 SDValue ARMTargetLowering::LowerCallResult( 1956 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1957 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1958 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1959 SDValue ThisVal) const { 1960 // Assign locations to each value returned by this call. 1961 SmallVector<CCValAssign, 16> RVLocs; 1962 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1963 *DAG.getContext()); 1964 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1965 1966 // Copy all of the result registers out of their specified physreg. 1967 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1968 CCValAssign VA = RVLocs[i]; 1969 1970 // Pass 'this' value directly from the argument to return value, to avoid 1971 // reg unit interference 1972 if (i == 0 && isThisReturn) { 1973 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1974 "unexpected return calling convention register assignment"); 1975 InVals.push_back(ThisVal); 1976 continue; 1977 } 1978 1979 SDValue Val; 1980 if (VA.needsCustom()) { 1981 // Handle f64 or half of a v2f64. 
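      // Sketch: under the integer calling convention an f64 result arrives
      // as two i32 halves in consecutive GPRs (e.g. r0/r1); they are
      // reassembled below with ARMISD::VMOVDRR (halves swapped on
      // big-endian), and a v2f64 result simply repeats this for each f64
      // element.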
1982 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1983 InFlag); 1984 Chain = Lo.getValue(1); 1985 InFlag = Lo.getValue(2); 1986 VA = RVLocs[++i]; // skip ahead to next loc 1987 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1988 InFlag); 1989 Chain = Hi.getValue(1); 1990 InFlag = Hi.getValue(2); 1991 if (!Subtarget->isLittle()) 1992 std::swap (Lo, Hi); 1993 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1994 1995 if (VA.getLocVT() == MVT::v2f64) { 1996 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1997 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1998 DAG.getConstant(0, dl, MVT::i32)); 1999 2000 VA = RVLocs[++i]; // skip ahead to next loc 2001 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2002 Chain = Lo.getValue(1); 2003 InFlag = Lo.getValue(2); 2004 VA = RVLocs[++i]; // skip ahead to next loc 2005 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2006 Chain = Hi.getValue(1); 2007 InFlag = Hi.getValue(2); 2008 if (!Subtarget->isLittle()) 2009 std::swap (Lo, Hi); 2010 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2011 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2012 DAG.getConstant(1, dl, MVT::i32)); 2013 } 2014 } else { 2015 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2016 InFlag); 2017 Chain = Val.getValue(1); 2018 InFlag = Val.getValue(2); 2019 } 2020 2021 switch (VA.getLocInfo()) { 2022 default: llvm_unreachable("Unknown loc info!"); 2023 case CCValAssign::Full: break; 2024 case CCValAssign::BCvt: 2025 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2026 break; 2027 } 2028 2029 InVals.push_back(Val); 2030 } 2031 2032 return Chain; 2033 } 2034 2035 /// LowerMemOpCallTo - Store the argument to the stack. 2036 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 2037 SDValue Arg, const SDLoc &dl, 2038 SelectionDAG &DAG, 2039 const CCValAssign &VA, 2040 ISD::ArgFlagsTy Flags) const { 2041 unsigned LocMemOffset = VA.getLocMemOffset(); 2042 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2043 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2044 StackPtr, PtrOff); 2045 return DAG.getStore( 2046 Chain, dl, Arg, PtrOff, 2047 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 2048 } 2049 2050 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2051 SDValue Chain, SDValue &Arg, 2052 RegsToPassVector &RegsToPass, 2053 CCValAssign &VA, CCValAssign &NextVA, 2054 SDValue &StackPtr, 2055 SmallVectorImpl<SDValue> &MemOpChains, 2056 ISD::ArgFlagsTy Flags) const { 2057 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2058 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2059 unsigned id = Subtarget->isLittle() ? 0 : 1; 2060 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2061 2062 if (NextVA.isRegLoc()) 2063 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2064 else { 2065 assert(NextVA.isMemLoc()); 2066 if (!StackPtr.getNode()) 2067 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2068 getPointerTy(DAG.getDataLayout())); 2069 2070 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 2071 dl, DAG, NextVA, 2072 Flags)); 2073 } 2074 } 2075 2076 /// LowerCall - Lowering a call into a callseq_start <- 2077 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2078 /// nodes. 
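/// A typical direct call therefore lowers to a chain of roughly
///   callseq_start -> CopyToReg(arg regs) -> ARMISD::CALL -> callseq_end
/// followed by CopyFromReg nodes for the results, with glue keeping the
/// register copies adjacent to the call itself.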
2079 SDValue 2080 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2081 SmallVectorImpl<SDValue> &InVals) const { 2082 SelectionDAG &DAG = CLI.DAG; 2083 SDLoc &dl = CLI.DL; 2084 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2085 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2086 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2087 SDValue Chain = CLI.Chain; 2088 SDValue Callee = CLI.Callee; 2089 bool &isTailCall = CLI.IsTailCall; 2090 CallingConv::ID CallConv = CLI.CallConv; 2091 bool doesNotRet = CLI.DoesNotReturn; 2092 bool isVarArg = CLI.IsVarArg; 2093 2094 MachineFunction &MF = DAG.getMachineFunction(); 2095 MachineFunction::CallSiteInfo CSInfo; 2096 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2097 bool isThisReturn = false; 2098 bool PreferIndirect = false; 2099 2100 // Disable tail calls if they're not supported. 2101 if (!Subtarget->supportsTailCall()) 2102 isTailCall = false; 2103 2104 if (isa<GlobalAddressSDNode>(Callee)) { 2105 // If we're optimizing for minimum size and the function is called three or 2106 // more times in this block, we can improve codesize by calling indirectly 2107 // as BLXr has a 16-bit encoding. 2108 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2109 if (CLI.CS) { 2110 auto *BB = CLI.CS.getParent(); 2111 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2112 count_if(GV->users(), [&BB](const User *U) { 2113 return isa<Instruction>(U) && 2114 cast<Instruction>(U)->getParent() == BB; 2115 }) > 2; 2116 } 2117 } 2118 if (isTailCall) { 2119 // Check if it's really possible to do a tail call. 2120 isTailCall = IsEligibleForTailCallOptimization( 2121 Callee, CallConv, isVarArg, isStructRet, 2122 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2123 PreferIndirect); 2124 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 2125 report_fatal_error("failed to perform tail call elimination on a call " 2126 "site marked musttail"); 2127 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2128 // detected sibcalls. 2129 if (isTailCall) 2130 ++NumTailCalls; 2131 } 2132 2133 // Analyze operands of the call, assigning locations to each operand. 2134 SmallVector<CCValAssign, 16> ArgLocs; 2135 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2136 *DAG.getContext()); 2137 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2138 2139 // Get a count of how many bytes are to be pushed on the stack. 2140 unsigned NumBytes = CCInfo.getNextStackOffset(); 2141 2142 if (isTailCall) { 2143 // For tail calls, memory operands are available in our caller's stack. 2144 NumBytes = 0; 2145 } else { 2146 // Adjust the stack pointer for the new arguments... 2147 // These operations are automatically eliminated by the prolog/epilog pass 2148 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2149 } 2150 2151 SDValue StackPtr = 2152 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2153 2154 RegsToPassVector RegsToPass; 2155 SmallVector<SDValue, 8> MemOpChains; 2156 2157 // Walk the register/memloc assignments, inserting copies/loads. In the case 2158 // of tail call optimization, arguments are handled later. 2159 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2160 i != e; 2161 ++i, ++realArgIdx) { 2162 CCValAssign &VA = ArgLocs[i]; 2163 SDValue Arg = OutVals[realArgIdx]; 2164 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2165 bool isByVal = Flags.isByVal(); 2166 2167 // Promote the value if needed. 
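    // e.g. an i8 or i16 argument assigned to an i32 location is widened here
    // with SIGN_EXTEND / ZERO_EXTEND / ANY_EXTEND according to its LocInfo,
    // and an f32 passed in a GPR becomes a BITCAST to i32.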
2168 switch (VA.getLocInfo()) { 2169 default: llvm_unreachable("Unknown loc info!"); 2170 case CCValAssign::Full: break; 2171 case CCValAssign::SExt: 2172 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2173 break; 2174 case CCValAssign::ZExt: 2175 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2176 break; 2177 case CCValAssign::AExt: 2178 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2179 break; 2180 case CCValAssign::BCvt: 2181 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2182 break; 2183 } 2184 2185 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2186 if (VA.needsCustom()) { 2187 if (VA.getLocVT() == MVT::v2f64) { 2188 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2189 DAG.getConstant(0, dl, MVT::i32)); 2190 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2191 DAG.getConstant(1, dl, MVT::i32)); 2192 2193 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 2194 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2195 2196 VA = ArgLocs[++i]; // skip ahead to next loc 2197 if (VA.isRegLoc()) { 2198 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 2199 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2200 } else { 2201 assert(VA.isMemLoc()); 2202 2203 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 2204 dl, DAG, VA, Flags)); 2205 } 2206 } else { 2207 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2208 StackPtr, MemOpChains, Flags); 2209 } 2210 } else if (VA.isRegLoc()) { 2211 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2212 Outs[0].VT == MVT::i32) { 2213 assert(VA.getLocVT() == MVT::i32 && 2214 "unexpected calling convention register assignment"); 2215 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2216 "unexpected use of 'returned'"); 2217 isThisReturn = true; 2218 } 2219 const TargetOptions &Options = DAG.getTarget().Options; 2220 if (Options.EnableDebugEntryValues) 2221 CSInfo.emplace_back(VA.getLocReg(), i); 2222 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2223 } else if (isByVal) { 2224 assert(VA.isMemLoc()); 2225 unsigned offset = 0; 2226 2227 // True if this byval aggregate will be split between registers 2228 // and memory. 2229 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2230 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2231 2232 if (CurByValIdx < ByValArgsCount) { 2233 2234 unsigned RegBegin, RegEnd; 2235 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2236 2237 EVT PtrVT = 2238 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2239 unsigned int i, j; 2240 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2241 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2242 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2243 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 2244 MachinePointerInfo(), 2245 DAG.InferPtrAlignment(AddArg)); 2246 MemOpChains.push_back(Load.getValue(1)); 2247 RegsToPass.push_back(std::make_pair(j, Load)); 2248 } 2249 2250 // If parameter size outsides register area, "offset" value 2251 // helps us to calculate stack slot for remained part properly. 
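        // For instance, a 16-byte byval split between r2-r3 and the stack
        // gives offset = 2 here, so the block copy below starts 4*2 = 8
        // bytes into the argument and writes the remainder to the slot at
        // LocMemOffset.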
2252 offset = RegEnd - RegBegin; 2253 2254 CCInfo.nextInRegsParam(); 2255 } 2256 2257 if (Flags.getByValSize() > 4*offset) { 2258 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2259 unsigned LocMemOffset = VA.getLocMemOffset(); 2260 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2261 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2262 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2263 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2264 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2265 MVT::i32); 2266 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 2267 MVT::i32); 2268 2269 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2270 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2271 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2272 Ops)); 2273 } 2274 } else if (!isTailCall) { 2275 assert(VA.isMemLoc()); 2276 2277 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2278 dl, DAG, VA, Flags)); 2279 } 2280 } 2281 2282 if (!MemOpChains.empty()) 2283 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2284 2285 // Build a sequence of copy-to-reg nodes chained together with token chain 2286 // and flag operands which copy the outgoing args into the appropriate regs. 2287 SDValue InFlag; 2288 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2289 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2290 RegsToPass[i].second, InFlag); 2291 InFlag = Chain.getValue(1); 2292 } 2293 2294 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2295 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2296 // node so that legalize doesn't hack it. 2297 bool isDirect = false; 2298 2299 const TargetMachine &TM = getTargetMachine(); 2300 const Module *Mod = MF.getFunction().getParent(); 2301 const GlobalValue *GV = nullptr; 2302 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2303 GV = G->getGlobal(); 2304 bool isStub = 2305 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2306 2307 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2308 bool isLocalARMFunc = false; 2309 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2310 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2311 2312 if (Subtarget->genLongCalls()) { 2313 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2314 "long-calls codegen is not position independent!"); 2315 // Handle a global address or an external symbol. If it's not one of 2316 // those, the target's already in a register, so we don't need to do 2317 // anything extra. 
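    // Sketch of the long-call sequence built here: the callee's address is
    // placed in this function's constant pool and loaded from it, so the
    // call becomes roughly
    //   ldr rX, <constant pool entry holding &callee>
    //   blx rX              (or an equivalent indirect call)
    // instead of a direct BL whose immediate range might not reach.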
2318 if (isa<GlobalAddressSDNode>(Callee)) { 2319 // Create a constant pool entry for the callee address 2320 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2321 ARMConstantPoolValue *CPV = 2322 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2323 2324 // Get the address of the callee into a register 2325 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2326 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2327 Callee = DAG.getLoad( 2328 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2329 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2330 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2331 const char *Sym = S->getSymbol(); 2332 2333 // Create a constant pool entry for the callee address 2334 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2335 ARMConstantPoolValue *CPV = 2336 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2337 ARMPCLabelIndex, 0); 2338 // Get the address of the callee into a register 2339 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2340 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2341 Callee = DAG.getLoad( 2342 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2343 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2344 } 2345 } else if (isa<GlobalAddressSDNode>(Callee)) { 2346 if (!PreferIndirect) { 2347 isDirect = true; 2348 bool isDef = GV->isStrongDefinitionForLinker(); 2349 2350 // ARM call to a local ARM function is predicable. 2351 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2352 // tBX takes a register source operand. 2353 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2354 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2355 Callee = DAG.getNode( 2356 ARMISD::WrapperPIC, dl, PtrVt, 2357 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2358 Callee = DAG.getLoad( 2359 PtrVt, dl, DAG.getEntryNode(), Callee, 2360 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2361 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2362 MachineMemOperand::MOInvariant); 2363 } else if (Subtarget->isTargetCOFF()) { 2364 assert(Subtarget->isTargetWindows() && 2365 "Windows is the only supported COFF target"); 2366 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2367 if (GV->hasDLLImportStorageClass()) 2368 TargetFlags = ARMII::MO_DLLIMPORT; 2369 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2370 TargetFlags = ARMII::MO_COFFSTUB; 2371 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2372 TargetFlags); 2373 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2374 Callee = 2375 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2376 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2377 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2378 } else { 2379 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2380 } 2381 } 2382 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2383 isDirect = true; 2384 // tBX takes a register source operand. 
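    // i.e. on Thumb1 without BLX the symbol's address is materialized
    // through the constant pool (plus a PIC add) and the call goes through a
    // register; otherwise a plain TargetExternalSymbol is emitted and the
    // call stays a direct BL.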
2385 const char *Sym = S->getSymbol(); 2386 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2387 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2388 ARMConstantPoolValue *CPV = 2389 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2390 ARMPCLabelIndex, 4); 2391 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2392 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2393 Callee = DAG.getLoad( 2394 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2395 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2396 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2397 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2398 } else { 2399 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2400 } 2401 } 2402 2403 // FIXME: handle tail calls differently. 2404 unsigned CallOpc; 2405 if (Subtarget->isThumb()) { 2406 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2407 CallOpc = ARMISD::CALL_NOLINK; 2408 else 2409 CallOpc = ARMISD::CALL; 2410 } else { 2411 if (!isDirect && !Subtarget->hasV5TOps()) 2412 CallOpc = ARMISD::CALL_NOLINK; 2413 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2414 // Emit regular call when code size is the priority 2415 !Subtarget->hasMinSize()) 2416 // "mov lr, pc; b _foo" to avoid confusing the RSP 2417 CallOpc = ARMISD::CALL_NOLINK; 2418 else 2419 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2420 } 2421 2422 std::vector<SDValue> Ops; 2423 Ops.push_back(Chain); 2424 Ops.push_back(Callee); 2425 2426 // Add argument registers to the end of the list so that they are known live 2427 // into the call. 2428 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2429 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2430 RegsToPass[i].second.getValueType())); 2431 2432 // Add a register mask operand representing the call-preserved registers. 2433 if (!isTailCall) { 2434 const uint32_t *Mask; 2435 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2436 if (isThisReturn) { 2437 // For 'this' returns, use the R0-preserving mask if applicable 2438 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2439 if (!Mask) { 2440 // Set isThisReturn to false if the calling convention is not one that 2441 // allows 'returned' to be modeled in this way, so LowerCallResult does 2442 // not try to pass 'this' straight through 2443 isThisReturn = false; 2444 Mask = ARI->getCallPreservedMask(MF, CallConv); 2445 } 2446 } else 2447 Mask = ARI->getCallPreservedMask(MF, CallConv); 2448 2449 assert(Mask && "Missing call preserved mask for calling convention"); 2450 Ops.push_back(DAG.getRegisterMask(Mask)); 2451 } 2452 2453 if (InFlag.getNode()) 2454 Ops.push_back(InFlag); 2455 2456 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2457 if (isTailCall) { 2458 MF.getFrameInfo().setHasTailCall(); 2459 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2460 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2461 return Ret; 2462 } 2463 2464 // Returns a chain and a flag for retval copy to use. 2465 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2466 InFlag = Chain.getValue(1); 2467 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2468 2469 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2470 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2471 if (!Ins.empty()) 2472 InFlag = Chain.getValue(1); 2473 2474 // Handle result values, copying them out of physregs into vregs that we 2475 // return. 
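  // e.g. an i64 result comes back in r0/r1 and, under the AAPCS-VFP
  // convention, an f64 comes back directly in d0; LowerCallResult copies
  // whichever physical registers the CC assigned into virtual registers.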
2476 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 2477 InVals, isThisReturn, 2478 isThisReturn ? OutVals[0] : SDValue()); 2479 } 2480 2481 /// HandleByVal - Every parameter *after* a byval parameter is passed 2482 /// on the stack. Remember the next parameter register to allocate, 2483 /// and then confiscate the rest of the parameter registers to insure 2484 /// this. 2485 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, 2486 unsigned Align) const { 2487 // Byval (as with any stack) slots are always at least 4 byte aligned. 2488 Align = std::max(Align, 4U); 2489 2490 unsigned Reg = State->AllocateReg(GPRArgRegs); 2491 if (!Reg) 2492 return; 2493 2494 unsigned AlignInRegs = Align / 4; 2495 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; 2496 for (unsigned i = 0; i < Waste; ++i) 2497 Reg = State->AllocateReg(GPRArgRegs); 2498 2499 if (!Reg) 2500 return; 2501 2502 unsigned Excess = 4 * (ARM::R4 - Reg); 2503 2504 // Special case when NSAA != SP and parameter size greater than size of 2505 // all remained GPR regs. In that case we can't split parameter, we must 2506 // send it to stack. We also must set NCRN to R4, so waste all 2507 // remained registers. 2508 const unsigned NSAAOffset = State->getNextStackOffset(); 2509 if (NSAAOffset != 0 && Size > Excess) { 2510 while (State->AllocateReg(GPRArgRegs)) 2511 ; 2512 return; 2513 } 2514 2515 // First register for byval parameter is the first register that wasn't 2516 // allocated before this method call, so it would be "reg". 2517 // If parameter is small enough to be saved in range [reg, r4), then 2518 // the end (first after last) register would be reg + param-size-in-regs, 2519 // else parameter would be splitted between registers and stack, 2520 // end register would be r4 in this case. 2521 unsigned ByValRegBegin = Reg; 2522 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2523 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2524 // Note, first register is allocated in the beginning of function already, 2525 // allocate remained amount of registers we need. 2526 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2527 State->AllocateReg(GPRArgRegs); 2528 // A byval parameter that is split between registers and memory needs its 2529 // size truncated here. 2530 // In the case where the entire structure fits in registers, we set the 2531 // size in memory to zero. 2532 Size = std::max<int>(Size - Excess, 0); 2533 } 2534 2535 /// MatchingStackOffset - Return true if the given stack call argument is 2536 /// already available in the same position (relatively) of the caller's 2537 /// incoming argument stack. 2538 static 2539 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2540 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2541 const TargetInstrInfo *TII) { 2542 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2543 int FI = std::numeric_limits<int>::max(); 2544 if (Arg.getOpcode() == ISD::CopyFromReg) { 2545 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2546 if (!Register::isVirtualRegister(VR)) 2547 return false; 2548 MachineInstr *Def = MRI->getVRegDef(VR); 2549 if (!Def) 2550 return false; 2551 if (!Flags.isByVal()) { 2552 if (!TII->isLoadFromStackSlot(*Def, FI)) 2553 return false; 2554 } else { 2555 return false; 2556 } 2557 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2558 if (Flags.isByVal()) 2559 // ByVal argument is passed in as a pointer but it's now being 2560 // dereferenced. e.g. 
2561 // define @foo(%struct.X* %A) { 2562 // tail call @bar(%struct.X* byval %A) 2563 // } 2564 return false; 2565 SDValue Ptr = Ld->getBasePtr(); 2566 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2567 if (!FINode) 2568 return false; 2569 FI = FINode->getIndex(); 2570 } else 2571 return false; 2572 2573 assert(FI != std::numeric_limits<int>::max()); 2574 if (!MFI.isFixedObjectIndex(FI)) 2575 return false; 2576 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2577 } 2578 2579 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2580 /// for tail call optimization. Targets which want to do tail call 2581 /// optimization should implement this function. 2582 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2583 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2584 bool isCalleeStructRet, bool isCallerStructRet, 2585 const SmallVectorImpl<ISD::OutputArg> &Outs, 2586 const SmallVectorImpl<SDValue> &OutVals, 2587 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2588 const bool isIndirect) const { 2589 MachineFunction &MF = DAG.getMachineFunction(); 2590 const Function &CallerF = MF.getFunction(); 2591 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2592 2593 assert(Subtarget->supportsTailCall()); 2594 2595 // Indirect tail calls cannot be optimized for Thumb1 if the args 2596 // to the call take up r0-r3. The reason is that there are no legal registers 2597 // left to hold the pointer to the function to be called. 2598 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2599 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) 2600 return false; 2601 2602 // Look for obvious safe cases to perform tail call optimization that do not 2603 // require ABI changes. This is what gcc calls sibcall. 2604 2605 // Exception-handling functions need a special set of instructions to indicate 2606 // a return to the hardware. Tail-calling another function would probably 2607 // break this. 2608 if (CallerF.hasFnAttribute("interrupt")) 2609 return false; 2610 2611 // Also avoid sibcall optimization if either caller or callee uses struct 2612 // return semantics. 2613 if (isCalleeStructRet || isCallerStructRet) 2614 return false; 2615 2616 // Externally-defined functions with weak linkage should not be 2617 // tail-called on ARM when the OS does not support dynamic 2618 // pre-emption of symbols, as the AAELF spec requires normal calls 2619 // to undefined weak functions to be replaced with a NOP or jump to the 2620 // next instruction. The behaviour of branch instructions in this 2621 // situation (as used for tail calls) is implementation-defined, so we 2622 // cannot rely on the linker replacing the tail call with a return. 2623 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2624 const GlobalValue *GV = G->getGlobal(); 2625 const Triple &TT = getTargetMachine().getTargetTriple(); 2626 if (GV->hasExternalWeakLinkage() && 2627 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2628 return false; 2629 } 2630 2631 // Check that the call results are passed in the same way. 2632 LLVMContext &C = *DAG.getContext(); 2633 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2634 CCAssignFnForReturn(CalleeCC, isVarArg), 2635 CCAssignFnForReturn(CallerCC, isVarArg))) 2636 return false; 2637 // The callee has to preserve all registers the caller needs to preserve. 
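  // e.g. tail-calling from an ordinary AAPCS function into a convention
  // that preserves fewer callee-saved registers would let the callee clobber
  // state the caller's own caller still expects; the register-mask subset
  // check below rejects such pairs.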
2638 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2639 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2640 if (CalleeCC != CallerCC) { 2641 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2642 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2643 return false; 2644 } 2645 2646 // If Caller's vararg or byval argument has been split between registers and 2647 // stack, do not perform tail call, since part of the argument is in caller's 2648 // local frame. 2649 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2650 if (AFI_Caller->getArgRegsSaveSize()) 2651 return false; 2652 2653 // If the callee takes no arguments then go on to check the results of the 2654 // call. 2655 if (!Outs.empty()) { 2656 // Check if stack adjustment is needed. For now, do not do this if any 2657 // argument is passed on the stack. 2658 SmallVector<CCValAssign, 16> ArgLocs; 2659 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2660 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2661 if (CCInfo.getNextStackOffset()) { 2662 // Check if the arguments are already laid out in the right way as 2663 // the caller's fixed stack objects. 2664 MachineFrameInfo &MFI = MF.getFrameInfo(); 2665 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2666 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2667 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2668 i != e; 2669 ++i, ++realArgIdx) { 2670 CCValAssign &VA = ArgLocs[i]; 2671 EVT RegVT = VA.getLocVT(); 2672 SDValue Arg = OutVals[realArgIdx]; 2673 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2674 if (VA.getLocInfo() == CCValAssign::Indirect) 2675 return false; 2676 if (VA.needsCustom()) { 2677 // f64 and vector types are split into multiple registers or 2678 // register/stack-slot combinations. The types will not match 2679 // the registers; give up on memory f64 refs until we figure 2680 // out what to do about this. 2681 if (!VA.isRegLoc()) 2682 return false; 2683 if (!ArgLocs[++i].isRegLoc()) 2684 return false; 2685 if (RegVT == MVT::v2f64) { 2686 if (!ArgLocs[++i].isRegLoc()) 2687 return false; 2688 if (!ArgLocs[++i].isRegLoc()) 2689 return false; 2690 } 2691 } else if (!VA.isRegLoc()) { 2692 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2693 MFI, MRI, TII)) 2694 return false; 2695 } 2696 } 2697 } 2698 2699 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2700 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2701 return false; 2702 } 2703 2704 return true; 2705 } 2706 2707 bool 2708 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2709 MachineFunction &MF, bool isVarArg, 2710 const SmallVectorImpl<ISD::OutputArg> &Outs, 2711 LLVMContext &Context) const { 2712 SmallVector<CCValAssign, 16> RVLocs; 2713 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2714 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2715 } 2716 2717 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2718 const SDLoc &DL, SelectionDAG &DAG) { 2719 const MachineFunction &MF = DAG.getMachineFunction(); 2720 const Function &F = MF.getFunction(); 2721 2722 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2723 2724 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2725 // version of the "preferred return address". 
These offsets affect the return 2726 // instruction if this is a return from PL1 without hypervisor extensions. 2727 // IRQ/FIQ: +4 "subs pc, lr, #4" 2728 // SWI: 0 "subs pc, lr, #0" 2729 // ABORT: +4 "subs pc, lr, #4" 2730 // UNDEF: +4/+2 "subs pc, lr, #0" 2731 // UNDEF varies depending on where the exception came from ARM or Thumb 2732 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2733 2734 int64_t LROffset; 2735 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2736 IntKind == "ABORT") 2737 LROffset = 4; 2738 else if (IntKind == "SWI" || IntKind == "UNDEF") 2739 LROffset = 0; 2740 else 2741 report_fatal_error("Unsupported interrupt attribute. If present, value " 2742 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2743 2744 RetOps.insert(RetOps.begin() + 1, 2745 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2746 2747 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2748 } 2749 2750 SDValue 2751 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2752 bool isVarArg, 2753 const SmallVectorImpl<ISD::OutputArg> &Outs, 2754 const SmallVectorImpl<SDValue> &OutVals, 2755 const SDLoc &dl, SelectionDAG &DAG) const { 2756 // CCValAssign - represent the assignment of the return value to a location. 2757 SmallVector<CCValAssign, 16> RVLocs; 2758 2759 // CCState - Info about the registers and stack slots. 2760 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2761 *DAG.getContext()); 2762 2763 // Analyze outgoing return values. 2764 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2765 2766 SDValue Flag; 2767 SmallVector<SDValue, 4> RetOps; 2768 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2769 bool isLittleEndian = Subtarget->isLittle(); 2770 2771 MachineFunction &MF = DAG.getMachineFunction(); 2772 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2773 AFI->setReturnRegsCount(RVLocs.size()); 2774 2775 // Copy the result values into the output registers. 2776 for (unsigned i = 0, realRVLocIdx = 0; 2777 i != RVLocs.size(); 2778 ++i, ++realRVLocIdx) { 2779 CCValAssign &VA = RVLocs[i]; 2780 assert(VA.isRegLoc() && "Can only return in registers!"); 2781 2782 SDValue Arg = OutVals[realRVLocIdx]; 2783 bool ReturnF16 = false; 2784 2785 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2786 // Half-precision return values can be returned like this: 2787 // 2788 // t11 f16 = fadd ... 2789 // t12: i16 = bitcast t11 2790 // t13: i32 = zero_extend t12 2791 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2792 // 2793 // to avoid code generation for bitcasts, we simply set Arg to the node 2794 // that produces the f16 value, t11 in this case. 2795 // 2796 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2797 SDValue ZE = Arg.getOperand(0); 2798 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2799 SDValue BC = ZE.getOperand(0); 2800 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2801 Arg = BC.getOperand(0); 2802 ReturnF16 = true; 2803 } 2804 } 2805 } 2806 } 2807 2808 switch (VA.getLocInfo()) { 2809 default: llvm_unreachable("Unknown loc info!"); 2810 case CCValAssign::Full: break; 2811 case CCValAssign::BCvt: 2812 if (!ReturnF16) 2813 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2814 break; 2815 } 2816 2817 if (VA.needsCustom()) { 2818 if (VA.getLocVT() == MVT::v2f64) { 2819 // Extract the first half and return it in two registers. 
2820 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2821 DAG.getConstant(0, dl, MVT::i32)); 2822 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2823 DAG.getVTList(MVT::i32, MVT::i32), Half); 2824 2825 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2826 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2827 Flag); 2828 Flag = Chain.getValue(1); 2829 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2830 VA = RVLocs[++i]; // skip ahead to next loc 2831 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2832 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2833 Flag); 2834 Flag = Chain.getValue(1); 2835 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2836 VA = RVLocs[++i]; // skip ahead to next loc 2837 2838 // Extract the 2nd half and fall through to handle it as an f64 value. 2839 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2840 DAG.getConstant(1, dl, MVT::i32)); 2841 } 2842 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2843 // available. 2844 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2845 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2846 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2847 fmrrd.getValue(isLittleEndian ? 0 : 1), 2848 Flag); 2849 Flag = Chain.getValue(1); 2850 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2851 VA = RVLocs[++i]; // skip ahead to next loc 2852 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2853 fmrrd.getValue(isLittleEndian ? 1 : 0), 2854 Flag); 2855 } else 2856 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2857 2858 // Guarantee that all emitted copies are 2859 // stuck together, avoiding something bad. 2860 Flag = Chain.getValue(1); 2861 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2862 ReturnF16 ? MVT::f16 : VA.getLocVT())); 2863 } 2864 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2865 const MCPhysReg *I = 2866 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2867 if (I) { 2868 for (; *I; ++I) { 2869 if (ARM::GPRRegClass.contains(*I)) 2870 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2871 else if (ARM::DPRRegClass.contains(*I)) 2872 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2873 else 2874 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2875 } 2876 } 2877 2878 // Update chain and glue. 2879 RetOps[0] = Chain; 2880 if (Flag.getNode()) 2881 RetOps.push_back(Flag); 2882 2883 // CPUs which aren't M-class use a special sequence to return from 2884 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2885 // though we use "subs pc, lr, #N"). 2886 // 2887 // M-class CPUs actually use a normal return sequence with a special 2888 // (hardware-provided) value in LR, so the normal code path works. 
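// (For example, an IRQ or FIQ handler on an A- or R-profile core typically
// returns with "subs pc, lr, #4" rather than a plain "bx lr";
// LowerInterruptReturn above chooses the offset for each interrupt kind.)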
2889 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 2890 !Subtarget->isMClass()) { 2891 if (Subtarget->isThumb1Only()) 2892 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2893 return LowerInterruptReturn(RetOps, dl, DAG); 2894 } 2895 2896 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2897 } 2898 2899 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2900 if (N->getNumValues() != 1) 2901 return false; 2902 if (!N->hasNUsesOfValue(1, 0)) 2903 return false; 2904 2905 SDValue TCChain = Chain; 2906 SDNode *Copy = *N->use_begin(); 2907 if (Copy->getOpcode() == ISD::CopyToReg) { 2908 // If the copy has a glue operand, we conservatively assume it isn't safe to 2909 // perform a tail call. 2910 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2911 return false; 2912 TCChain = Copy->getOperand(0); 2913 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2914 SDNode *VMov = Copy; 2915 // f64 returned in a pair of GPRs. 2916 SmallPtrSet<SDNode*, 2> Copies; 2917 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2918 UI != UE; ++UI) { 2919 if (UI->getOpcode() != ISD::CopyToReg) 2920 return false; 2921 Copies.insert(*UI); 2922 } 2923 if (Copies.size() > 2) 2924 return false; 2925 2926 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2927 UI != UE; ++UI) { 2928 SDValue UseChain = UI->getOperand(0); 2929 if (Copies.count(UseChain.getNode())) 2930 // Second CopyToReg 2931 Copy = *UI; 2932 else { 2933 // We are at the top of this chain. 2934 // If the copy has a glue operand, we conservatively assume it 2935 // isn't safe to perform a tail call. 2936 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2937 return false; 2938 // First CopyToReg 2939 TCChain = UseChain; 2940 } 2941 } 2942 } else if (Copy->getOpcode() == ISD::BITCAST) { 2943 // f32 returned in a single GPR. 2944 if (!Copy->hasOneUse()) 2945 return false; 2946 Copy = *Copy->use_begin(); 2947 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2948 return false; 2949 // If the copy has a glue operand, we conservatively assume it isn't safe to 2950 // perform a tail call. 2951 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2952 return false; 2953 TCChain = Copy->getOperand(0); 2954 } else { 2955 return false; 2956 } 2957 2958 bool HasRet = false; 2959 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2960 UI != UE; ++UI) { 2961 if (UI->getOpcode() != ARMISD::RET_FLAG && 2962 UI->getOpcode() != ARMISD::INTRET_FLAG) 2963 return false; 2964 HasRet = true; 2965 } 2966 2967 if (!HasRet) 2968 return false; 2969 2970 Chain = TCChain; 2971 return true; 2972 } 2973 2974 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 2975 if (!Subtarget->supportsTailCall()) 2976 return false; 2977 2978 if (!CI->isTailCall()) 2979 return false; 2980 2981 return true; 2982 } 2983 2984 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2985 // and pass the lower and high parts through. 2986 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2987 SDLoc DL(Op); 2988 SDValue WriteValue = Op->getOperand(2); 2989 2990 // This function is only supposed to be called for i64 type argument. 
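// As an illustrative sketch, a write of an i64 value V is rewritten as
//   lo: i32 = extract_element V, 0
//   hi: i32 = extract_element V, 1
//   WRITE_REGISTER chain, regname, lo, hi
// so each 32-bit half can be moved into its half of the 64-bit register.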
2991 assert(WriteValue.getValueType() == MVT::i64 2992 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2993 2994 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2995 DAG.getConstant(0, DL, MVT::i32)); 2996 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2997 DAG.getConstant(1, DL, MVT::i32)); 2998 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2999 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 3000 } 3001 3002 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 3003 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 3004 // one of the above-mentioned nodes. It has to be wrapped because otherwise 3005 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 3006 // be used to form an addressing mode. These wrapped nodes will be selected 3007 // into MOVi. 3008 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3009 SelectionDAG &DAG) const { 3010 EVT PtrVT = Op.getValueType(); 3011 // FIXME there is no actual debug info here 3012 SDLoc dl(Op); 3013 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3014 SDValue Res; 3015 3016 // When generating execute-only code Constant Pools must be promoted to the 3017 // global data section. It's a bit ugly that we can't share them across basic 3018 // blocks, but this way we guarantee that execute-only behaves correctly with 3019 // position-independent addressing modes. 3020 if (Subtarget->genExecuteOnly()) { 3021 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3022 auto T = const_cast<Type*>(CP->getType()); 3023 auto C = const_cast<Constant*>(CP->getConstVal()); 3024 auto M = const_cast<Module*>(DAG.getMachineFunction(). 3025 getFunction().getParent()); 3026 auto GV = new GlobalVariable( 3027 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3028 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3029 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3030 Twine(AFI->createPICLabelUId()) 3031 ); 3032 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3033 dl, PtrVT); 3034 return LowerGlobalAddress(GA, DAG); 3035 } 3036 3037 if (CP->isMachineConstantPoolEntry()) 3038 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 3039 CP->getAlignment()); 3040 else 3041 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 3042 CP->getAlignment()); 3043 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3044 } 3045 3046 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3047 return MachineJumpTableInfo::EK_Inline; 3048 } 3049 3050 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3051 SelectionDAG &DAG) const { 3052 MachineFunction &MF = DAG.getMachineFunction(); 3053 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3054 unsigned ARMPCLabelIndex = 0; 3055 SDLoc DL(Op); 3056 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3057 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3058 SDValue CPAddr; 3059 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3060 if (!IsPositionIndependent) { 3061 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 3062 } else { 3063 unsigned PCAdj = Subtarget->isThumb() ?
4 : 8; 3064 ARMPCLabelIndex = AFI->createPICLabelUId(); 3065 ARMConstantPoolValue *CPV = 3066 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3067 ARMCP::CPBlockAddress, PCAdj); 3068 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3069 } 3070 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3071 SDValue Result = DAG.getLoad( 3072 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3073 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3074 if (!IsPositionIndependent) 3075 return Result; 3076 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3077 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3078 } 3079 3080 /// Convert a TLS address reference into the correct sequence of loads 3081 /// and calls to compute the variable's address for Darwin, and return an 3082 /// SDValue containing the final node. 3083 3084 /// Darwin only has one TLS scheme which must be capable of dealing with the 3085 /// fully general situation, in the worst case. This means: 3086 /// + "extern __thread" declaration. 3087 /// + Defined in a possibly unknown dynamic library. 3088 /// 3089 /// The general system is that each __thread variable has a [3 x i32] descriptor 3090 /// which contains information used by the runtime to calculate the address. The 3091 /// only part of this the compiler needs to know about is the first word, which 3092 /// contains a function pointer that must be called with the address of the 3093 /// entire descriptor in "r0". 3094 /// 3095 /// Since this descriptor may be in a different unit, in general access must 3096 /// proceed along the usual ARM rules. A common sequence to produce is: 3097 /// 3098 /// movw rT1, :lower16:_var$non_lazy_ptr 3099 /// movt rT1, :upper16:_var$non_lazy_ptr 3100 /// ldr r0, [rT1] 3101 /// ldr rT2, [r0] 3102 /// blx rT2 3103 /// [...address now in r0...] 3104 SDValue 3105 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, 3106 SelectionDAG &DAG) const { 3107 assert(Subtarget->isTargetDarwin() && 3108 "This function expects a Darwin target"); 3109 SDLoc DL(Op); 3110 3111 // First step is to get the address of the actual global symbol. This is where 3112 // the TLS descriptor lives. 3113 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); 3114 3115 // The first entry in the descriptor is a function pointer that we must call 3116 // to obtain the address of the variable. 3117 SDValue Chain = DAG.getEntryNode(); 3118 SDValue FuncTLVGet = DAG.getLoad( 3119 MVT::i32, DL, Chain, DescAddr, 3120 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 3121 /* Alignment = */ 4, 3122 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | 3123 MachineMemOperand::MOInvariant); 3124 Chain = FuncTLVGet.getValue(1); 3125 3126 MachineFunction &F = DAG.getMachineFunction(); 3127 MachineFrameInfo &MFI = F.getFrameInfo(); 3128 MFI.setAdjustsStack(true); 3129 3130 // TLS calls preserve all registers except those that absolutely must be 3131 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be 3132 // silly). 3133 auto TRI = 3134 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); 3135 auto ARI = static_cast<const ARMRegisterInfo *>(TRI); 3136 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); 3137 3138 // Finally, we can make the call. This is just a degenerate version of a 3139 // normal ARM call node: r0 takes the address of the descriptor, and 3140 // returns the address of the variable in this thread.
3141 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3142 Chain = 3143 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3144 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3145 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3146 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3147 } 3148 3149 SDValue 3150 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3151 SelectionDAG &DAG) const { 3152 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3153 3154 SDValue Chain = DAG.getEntryNode(); 3155 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3156 SDLoc DL(Op); 3157 3158 // Load the current TEB (thread environment block) 3159 SDValue Ops[] = {Chain, 3160 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3161 DAG.getTargetConstant(15, DL, MVT::i32), 3162 DAG.getTargetConstant(0, DL, MVT::i32), 3163 DAG.getTargetConstant(13, DL, MVT::i32), 3164 DAG.getTargetConstant(0, DL, MVT::i32), 3165 DAG.getTargetConstant(2, DL, MVT::i32)}; 3166 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3167 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3168 3169 SDValue TEB = CurrentTEB.getValue(0); 3170 Chain = CurrentTEB.getValue(1); 3171 3172 // Load the ThreadLocalStoragePointer from the TEB 3173 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3174 SDValue TLSArray = 3175 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3176 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3177 3178 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3179 // offset into the TLSArray. 3180 3181 // Load the TLS index from the C runtime 3182 SDValue TLSIndex = 3183 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3184 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3185 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3186 3187 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3188 DAG.getConstant(2, DL, MVT::i32)); 3189 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3190 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3191 MachinePointerInfo()); 3192 3193 // Get the offset of the start of the .tls section (section base) 3194 const auto *GA = cast<GlobalAddressSDNode>(Op); 3195 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3196 SDValue Offset = DAG.getLoad( 3197 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3198 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 3199 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3200 3201 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3202 } 3203 3204 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3205 SDValue 3206 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3207 SelectionDAG &DAG) const { 3208 SDLoc dl(GA); 3209 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3210 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3211 MachineFunction &MF = DAG.getMachineFunction(); 3212 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3213 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3214 ARMConstantPoolValue *CPV = 3215 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3216 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3217 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3218 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3219 Argument = DAG.getLoad( 3220 PtrVT, dl, DAG.getEntryNode(), Argument, 3221 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3222 SDValue Chain = Argument.getValue(1); 3223 3224 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3225 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3226 3227 // call __tls_get_addr. 3228 ArgListTy Args; 3229 ArgListEntry Entry; 3230 Entry.Node = Argument; 3231 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3232 Args.push_back(Entry); 3233 3234 // FIXME: is there useful debug info available here? 3235 TargetLowering::CallLoweringInfo CLI(DAG); 3236 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3237 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3238 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3239 3240 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3241 return CallResult.first; 3242 } 3243 3244 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3245 // "local exec" model. 3246 SDValue 3247 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3248 SelectionDAG &DAG, 3249 TLSModel::Model model) const { 3250 const GlobalValue *GV = GA->getGlobal(); 3251 SDLoc dl(GA); 3252 SDValue Offset; 3253 SDValue Chain = DAG.getEntryNode(); 3254 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3255 // Get the Thread Pointer 3256 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3257 3258 if (model == TLSModel::InitialExec) { 3259 MachineFunction &MF = DAG.getMachineFunction(); 3260 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3261 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3262 // Initial exec model. 3263 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3264 ARMConstantPoolValue *CPV = 3265 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3266 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3267 true); 3268 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3269 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3270 Offset = DAG.getLoad( 3271 PtrVT, dl, Chain, Offset, 3272 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3273 Chain = Offset.getValue(1); 3274 3275 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3276 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3277 3278 Offset = DAG.getLoad( 3279 PtrVT, dl, Chain, Offset, 3280 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3281 } else { 3282 // local exec model 3283 assert(model == TLSModel::LocalExec); 3284 ARMConstantPoolValue *CPV = 3285 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3286 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3287 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3288 Offset = DAG.getLoad( 3289 PtrVT, dl, Chain, Offset, 3290 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3291 } 3292 3293 // The address of the thread local variable is the add of the thread 3294 // pointer with the offset of the variable. 
3295 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3296 } 3297 3298 SDValue 3299 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3300 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3301 if (DAG.getTarget().useEmulatedTLS()) 3302 return LowerToTLSEmulatedModel(GA, DAG); 3303 3304 if (Subtarget->isTargetDarwin()) 3305 return LowerGlobalTLSAddressDarwin(Op, DAG); 3306 3307 if (Subtarget->isTargetWindows()) 3308 return LowerGlobalTLSAddressWindows(Op, DAG); 3309 3310 // TODO: implement the "local dynamic" model 3311 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3312 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3313 3314 switch (model) { 3315 case TLSModel::GeneralDynamic: 3316 case TLSModel::LocalDynamic: 3317 return LowerToTLSGeneralDynamicModel(GA, DAG); 3318 case TLSModel::InitialExec: 3319 case TLSModel::LocalExec: 3320 return LowerToTLSExecModels(GA, DAG, model); 3321 } 3322 llvm_unreachable("bogus TLS model"); 3323 } 3324 3325 /// Return true if all users of V are within function F, looking through 3326 /// ConstantExprs. 3327 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3328 SmallVector<const User*,4> Worklist; 3329 for (auto *U : V->users()) 3330 Worklist.push_back(U); 3331 while (!Worklist.empty()) { 3332 auto *U = Worklist.pop_back_val(); 3333 if (isa<ConstantExpr>(U)) { 3334 for (auto *UU : U->users()) 3335 Worklist.push_back(UU); 3336 continue; 3337 } 3338 3339 auto *I = dyn_cast<Instruction>(U); 3340 if (!I || I->getParent()->getParent() != F) 3341 return false; 3342 } 3343 return true; 3344 } 3345 3346 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3347 const GlobalValue *GV, SelectionDAG &DAG, 3348 EVT PtrVT, const SDLoc &dl) { 3349 // If we're creating a pool entry for a constant global with unnamed address, 3350 // and the global is small enough, we can emit it inline into the constant pool 3351 // to save ourselves an indirection. 3352 // 3353 // This is a win if the constant is only used in one function (so it doesn't 3354 // need to be duplicated) or duplicating the constant wouldn't increase code 3355 // size (implying the constant is no larger than 4 bytes). 3356 const Function &F = DAG.getMachineFunction().getFunction(); 3357 3358 // We rely on this decision to inline being idempotent and unrelated to the 3359 // use-site. We know that if we inline a variable at one use site, we'll 3360 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3361 // doesn't know about this optimization, so bail out if it's enabled; otherwise 3362 // we could decide to inline here (and thus never emit the GV) but require 3363 // the GV from fast-isel generated code. 3364 if (!EnableConstpoolPromotion || 3365 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3366 return SDValue(); 3367 3368 auto *GVar = dyn_cast<GlobalVariable>(GV); 3369 if (!GVar || !GVar->hasInitializer() || 3370 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3371 !GVar->hasLocalLinkage()) 3372 return SDValue(); 3373 3374 // If we inline a value that contains relocations, we move the relocations 3375 // from .data to .text. This is not allowed in position-independent code.
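// For illustration (hypothetical IR), an initializer such as
//   @ptr = private unnamed_addr constant i32* @other
// needs a relocation for @other and so cannot be moved into .text here.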
3376 auto *Init = GVar->getInitializer(); 3377 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3378 Init->needsRelocation()) 3379 return SDValue(); 3380 3381 // The constant islands pass can only really deal with alignment requests 3382 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3383 // any type wanting greater alignment requirements than 4 bytes. We also 3384 // can only promote constants that are multiples of 4 bytes in size or 3385 // are paddable to a multiple of 4. Currently we only try and pad constants 3386 // that are strings for simplicity. 3387 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3388 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3389 unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); 3390 unsigned RequiredPadding = 4 - (Size % 4); 3391 bool PaddingPossible = 3392 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3393 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3394 Size == 0) 3395 return SDValue(); 3396 3397 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3398 MachineFunction &MF = DAG.getMachineFunction(); 3399 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3400 3401 // We can't bloat the constant pool too much, else the ConstantIslands pass 3402 // may fail to converge. If we haven't promoted this global yet (it may have 3403 // multiple uses), and promoting it would increase the constant pool size (Sz 3404 // > 4), ensure we have space to do so up to MaxTotal. 3405 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3406 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3407 ConstpoolPromotionMaxTotal) 3408 return SDValue(); 3409 3410 // This is only valid if all users are in a single function; we can't clone 3411 // the constant in general. The LLVM IR unnamed_addr allows merging 3412 // constants, but not cloning them. 3413 // 3414 // We could potentially allow cloning if we could prove all uses of the 3415 // constant in the current function don't care about the address, like 3416 // printf format strings. But that isn't implemented for now. 3417 if (!allUsersAreInFunction(GVar, &F)) 3418 return SDValue(); 3419 3420 // We're going to inline this global. Pad it out if needed. 
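// For example (illustrative only): a 6-byte string initializer has Size == 6
// and RequiredPadding == 2, so two zero bytes are appended below and
// PaddedSize becomes 8, a multiple of 4 that the constant islands pass can
// lay out without extra alignment work.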
3421 if (RequiredPadding != 4) { 3422 StringRef S = CDAInit->getAsString(); 3423 3424 SmallVector<uint8_t,16> V(S.size()); 3425 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3426 while (RequiredPadding--) 3427 V.push_back(0); 3428 Init = ConstantDataArray::get(*DAG.getContext(), V); 3429 } 3430 3431 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3432 SDValue CPAddr = 3433 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3434 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3435 AFI->markGlobalAsPromotedToConstantPool(GVar); 3436 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3437 PaddedSize - 4); 3438 } 3439 ++NumConstpoolPromoted; 3440 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3441 } 3442 3443 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3444 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3445 if (!(GV = GA->getBaseObject())) 3446 return false; 3447 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3448 return V->isConstant(); 3449 return isa<Function>(GV); 3450 } 3451 3452 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3453 SelectionDAG &DAG) const { 3454 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3455 default: llvm_unreachable("unknown object format"); 3456 case Triple::COFF: 3457 return LowerGlobalAddressWindows(Op, DAG); 3458 case Triple::ELF: 3459 return LowerGlobalAddressELF(Op, DAG); 3460 case Triple::MachO: 3461 return LowerGlobalAddressDarwin(Op, DAG); 3462 } 3463 } 3464 3465 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3466 SelectionDAG &DAG) const { 3467 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3468 SDLoc dl(Op); 3469 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3470 const TargetMachine &TM = getTargetMachine(); 3471 bool IsRO = isReadOnly(GV); 3472 3473 // promoteToConstantPool only if not generating XO text section 3474 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3475 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3476 return V; 3477 3478 if (isPositionIndependent()) { 3479 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3480 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3481 UseGOT_PREL ? ARMII::MO_GOT : 0); 3482 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3483 if (UseGOT_PREL) 3484 Result = 3485 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3486 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3487 return Result; 3488 } else if (Subtarget->isROPI() && IsRO) { 3489 // PC-relative. 3490 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3491 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3492 return Result; 3493 } else if (Subtarget->isRWPI() && !IsRO) { 3494 // SB-relative. 
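// For RWPI, writable globals are addressed relative to the static base (SB),
// which lives in r9: the SB-relative offset of the global is materialized
// (with movw/movt when available, otherwise via a literal-pool load) and then
// added to r9, i.e. roughly addr = r9 + sbrel_offset(GV).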
3495 SDValue RelAddr; 3496 if (Subtarget->useMovt()) { 3497 ++NumMovwMovt; 3498 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3499 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3500 } else { // use literal pool for address constant 3501 ARMConstantPoolValue *CPV = 3502 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3503 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3504 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3505 RelAddr = DAG.getLoad( 3506 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3507 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3508 } 3509 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3510 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3511 return Result; 3512 } 3513 3514 // If we have T2 ops, we can materialize the address directly via movt/movw 3515 // pair. This is always cheaper. 3516 if (Subtarget->useMovt()) { 3517 ++NumMovwMovt; 3518 // FIXME: Once remat is capable of dealing with instructions with register 3519 // operands, expand this into two nodes. 3520 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3521 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3522 } else { 3523 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3524 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3525 return DAG.getLoad( 3526 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3527 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3528 } 3529 } 3530 3531 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3532 SelectionDAG &DAG) const { 3533 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3534 "ROPI/RWPI not currently supported for Darwin"); 3535 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3536 SDLoc dl(Op); 3537 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3538 3539 if (Subtarget->useMovt()) 3540 ++NumMovwMovt; 3541 3542 // FIXME: Once remat is capable of dealing with instructions with register 3543 // operands, expand this into multiple nodes 3544 unsigned Wrapper = 3545 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3546 3547 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3548 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3549 3550 if (Subtarget->isGVIndirectSymbol(GV)) 3551 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3552 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3553 return Result; 3554 } 3555 3556 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3557 SelectionDAG &DAG) const { 3558 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3559 assert(Subtarget->useMovt() && 3560 "Windows on ARM expects to use movw/movt"); 3561 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3562 "ROPI/RWPI not currently supported for Windows"); 3563 3564 const TargetMachine &TM = getTargetMachine(); 3565 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3566 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3567 if (GV->hasDLLImportStorageClass()) 3568 TargetFlags = ARMII::MO_DLLIMPORT; 3569 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3570 TargetFlags = ARMII::MO_COFFSTUB; 3571 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3572 SDValue Result; 3573 SDLoc DL(Op); 3574 3575 ++NumMovwMovt; 3576 3577 // FIXME: Once remat is capable of dealing with instructions with register 3578 // operands, expand this into two nodes. 
3579 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3580 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3581 TargetFlags)); 3582 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3583 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3584 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3585 return Result; 3586 } 3587 3588 SDValue 3589 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3590 SDLoc dl(Op); 3591 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3592 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3593 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3594 Op.getOperand(1), Val); 3595 } 3596 3597 SDValue 3598 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3599 SDLoc dl(Op); 3600 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3601 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3602 } 3603 3604 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3605 SelectionDAG &DAG) const { 3606 SDLoc dl(Op); 3607 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3608 Op.getOperand(0)); 3609 } 3610 3611 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3612 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3613 unsigned IntNo = 3614 cast<ConstantSDNode>( 3615 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3616 ->getZExtValue(); 3617 switch (IntNo) { 3618 default: 3619 return SDValue(); // Don't custom lower most intrinsics. 3620 case Intrinsic::arm_gnu_eabi_mcount: { 3621 MachineFunction &MF = DAG.getMachineFunction(); 3622 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3623 SDLoc dl(Op); 3624 SDValue Chain = Op.getOperand(0); 3625 // call "\01__gnu_mcount_nc" 3626 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3627 const uint32_t *Mask = 3628 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3629 assert(Mask && "Missing call preserved mask for calling convention"); 3630 // Mark LR an implicit live-in. 3631 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3632 SDValue ReturnAddress = 3633 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3634 std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; 3635 SDValue Callee = 3636 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3637 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3638 if (Subtarget->isThumb()) 3639 return SDValue( 3640 DAG.getMachineNode( 3641 ARM::tBL_PUSHLR, dl, ResultTys, 3642 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3643 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3644 0); 3645 return SDValue( 3646 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3647 {ReturnAddress, Callee, RegisterMask, Chain}), 3648 0); 3649 } 3650 } 3651 } 3652 3653 SDValue 3654 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3655 const ARMSubtarget *Subtarget) const { 3656 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3657 SDLoc dl(Op); 3658 switch (IntNo) { 3659 default: return SDValue(); // Don't custom lower most intrinsics. 
3660 case Intrinsic::thread_pointer: { 3661 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3662 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3663 } 3664 case Intrinsic::arm_cls: { 3665 const SDValue &Operand = Op.getOperand(1); 3666 const EVT VTy = Op.getValueType(); 3667 SDValue SRA = 3668 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3669 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3670 SDValue SHL = 3671 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3672 SDValue OR = 3673 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3674 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3675 return Result; 3676 } 3677 case Intrinsic::arm_cls64: { 3678 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3679 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3680 const SDValue &Operand = Op.getOperand(1); 3681 const EVT VTy = Op.getValueType(); 3682 3683 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3684 DAG.getConstant(1, dl, VTy)); 3685 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3686 DAG.getConstant(0, dl, VTy)); 3687 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3688 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3689 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3690 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3691 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3692 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3693 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3694 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3695 SDValue CheckLo = 3696 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3697 SDValue HiIsZero = 3698 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3699 SDValue AdjustedLo = 3700 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3701 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 3702 SDValue Result = 3703 DAG.getSelect(dl, VTy, CheckLo, 3704 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3705 return Result; 3706 } 3707 case Intrinsic::eh_sjlj_lsda: { 3708 MachineFunction &MF = DAG.getMachineFunction(); 3709 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3710 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3711 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3712 SDValue CPAddr; 3713 bool IsPositionIndependent = isPositionIndependent(); 3714 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3715 ARMConstantPoolValue *CPV = 3716 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3717 ARMCP::CPLSDA, PCAdj); 3718 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3719 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3720 SDValue Result = DAG.getLoad( 3721 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3722 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3723 3724 if (IsPositionIndependent) { 3725 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3726 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3727 } 3728 return Result; 3729 } 3730 case Intrinsic::arm_neon_vabs: 3731 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3732 Op.getOperand(1)); 3733 case Intrinsic::arm_neon_vmulls: 3734 case Intrinsic::arm_neon_vmullu: { 3735 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3736 ? 
ARMISD::VMULLs : ARMISD::VMULLu; 3737 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3738 Op.getOperand(1), Op.getOperand(2)); 3739 } 3740 case Intrinsic::arm_neon_vminnm: 3741 case Intrinsic::arm_neon_vmaxnm: { 3742 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3743 ? ISD::FMINNUM : ISD::FMAXNUM; 3744 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3745 Op.getOperand(1), Op.getOperand(2)); 3746 } 3747 case Intrinsic::arm_neon_vminu: 3748 case Intrinsic::arm_neon_vmaxu: { 3749 if (Op.getValueType().isFloatingPoint()) 3750 return SDValue(); 3751 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3752 ? ISD::UMIN : ISD::UMAX; 3753 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3754 Op.getOperand(1), Op.getOperand(2)); 3755 } 3756 case Intrinsic::arm_neon_vmins: 3757 case Intrinsic::arm_neon_vmaxs: { 3758 // v{min,max}s is overloaded between signed integers and floats. 3759 if (!Op.getValueType().isFloatingPoint()) { 3760 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3761 ? ISD::SMIN : ISD::SMAX; 3762 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3763 Op.getOperand(1), Op.getOperand(2)); 3764 } 3765 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3766 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3767 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3768 Op.getOperand(1), Op.getOperand(2)); 3769 } 3770 case Intrinsic::arm_neon_vtbl1: 3771 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3772 Op.getOperand(1), Op.getOperand(2)); 3773 case Intrinsic::arm_neon_vtbl2: 3774 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3775 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3776 case Intrinsic::arm_mve_pred_i2v: 3777 case Intrinsic::arm_mve_pred_v2i: 3778 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 3779 Op.getOperand(1)); 3780 } 3781 } 3782 3783 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3784 const ARMSubtarget *Subtarget) { 3785 SDLoc dl(Op); 3786 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3787 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3788 if (SSID == SyncScope::SingleThread) 3789 return Op; 3790 3791 if (!Subtarget->hasDataBarrier()) { 3792 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3793 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3794 // here. 3795 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3796 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3797 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3798 DAG.getConstant(0, dl, MVT::i32)); 3799 } 3800 3801 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3802 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3803 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3804 if (Subtarget->isMClass()) { 3805 // Only a full system barrier exists in the M-class architectures. 3806 Domain = ARM_MB::SY; 3807 } else if (Subtarget->preferISHSTBarriers() && 3808 Ord == AtomicOrdering::Release) { 3809 // Swift happens to implement ISHST barriers in a way that's compatible with 3810 // Release semantics but weaker than ISH so we'd be fools not to use 3811 // it. Beware: other processors probably don't! 
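// (Concretely, this lets a release fence become "dmb ishst" here instead of
// the stronger "dmb ish" emitted for the other non-M-class cases below.)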
3812 Domain = ARM_MB::ISHST; 3813 } 3814 3815 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3816 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3817 DAG.getConstant(Domain, dl, MVT::i32)); 3818 } 3819 3820 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3821 const ARMSubtarget *Subtarget) { 3822 // ARM pre v5TE and Thumb1 does not have preload instructions. 3823 if (!(Subtarget->isThumb2() || 3824 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3825 // Just preserve the chain. 3826 return Op.getOperand(0); 3827 3828 SDLoc dl(Op); 3829 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3830 if (!isRead && 3831 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3832 // ARMv7 with MP extension has PLDW. 3833 return Op.getOperand(0); 3834 3835 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3836 if (Subtarget->isThumb()) { 3837 // Invert the bits. 3838 isRead = ~isRead & 1; 3839 isData = ~isData & 1; 3840 } 3841 3842 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3843 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3844 DAG.getConstant(isData, dl, MVT::i32)); 3845 } 3846 3847 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3848 MachineFunction &MF = DAG.getMachineFunction(); 3849 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3850 3851 // vastart just stores the address of the VarArgsFrameIndex slot into the 3852 // memory location argument. 3853 SDLoc dl(Op); 3854 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3855 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3856 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3857 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3858 MachinePointerInfo(SV)); 3859 } 3860 3861 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3862 CCValAssign &NextVA, 3863 SDValue &Root, 3864 SelectionDAG &DAG, 3865 const SDLoc &dl) const { 3866 MachineFunction &MF = DAG.getMachineFunction(); 3867 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3868 3869 const TargetRegisterClass *RC; 3870 if (AFI->isThumb1OnlyFunction()) 3871 RC = &ARM::tGPRRegClass; 3872 else 3873 RC = &ARM::GPRRegClass; 3874 3875 // Transform the arguments stored in physical registers into virtual ones. 3876 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3877 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3878 3879 SDValue ArgValue2; 3880 if (NextVA.isMemLoc()) { 3881 MachineFrameInfo &MFI = MF.getFrameInfo(); 3882 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3883 3884 // Create load node to retrieve arguments from the stack. 3885 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3886 ArgValue2 = DAG.getLoad( 3887 MVT::i32, dl, Root, FIN, 3888 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3889 } else { 3890 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3891 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3892 } 3893 if (!Subtarget->isLittle()) 3894 std::swap (ArgValue, ArgValue2); 3895 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3896 } 3897 3898 // The remaining GPRs hold either the beginning of variable-argument 3899 // data, or the beginning of an aggregate passed by value (usually 3900 // byval). 
Either way, we allocate stack slots adjacent to the data 3901 // provided by our caller, and store the unallocated registers there. 3902 // If this is a variadic function, the va_list pointer will begin with 3903 // these values; otherwise, this reassembles a (byval) structure that 3904 // was split between registers and memory. 3905 // Return: The frame index the registers were stored into. 3906 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3907 const SDLoc &dl, SDValue &Chain, 3908 const Value *OrigArg, 3909 unsigned InRegsParamRecordIdx, 3910 int ArgOffset, unsigned ArgSize) const { 3911 // Currently, two use cases are possible: 3912 // Case #1. Non-var-args function, and we meet the first byval parameter. 3913 // Set up the first unallocated register as the first byval register; 3914 // eat all remaining registers 3915 // (these two actions are performed by the HandleByVal method). 3916 // Then, here, we initialize the stack frame with 3917 // "store-reg" instructions. 3918 // Case #2. Var-args function that doesn't contain byval parameters. 3919 // The same: eat all remaining unallocated registers, 3920 // initialize the stack frame. 3921 3922 MachineFunction &MF = DAG.getMachineFunction(); 3923 MachineFrameInfo &MFI = MF.getFrameInfo(); 3924 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3925 unsigned RBegin, REnd; 3926 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3927 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3928 } else { 3929 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3930 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3931 REnd = ARM::R4; 3932 } 3933 3934 if (REnd != RBegin) 3935 ArgOffset = -4 * (ARM::R4 - RBegin); 3936 3937 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3938 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3939 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3940 3941 SmallVector<SDValue, 4> MemOps; 3942 const TargetRegisterClass *RC = 3943 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3944 3945 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3946 unsigned VReg = MF.addLiveIn(Reg, RC); 3947 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3948 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3949 MachinePointerInfo(OrigArg, 4 * i)); 3950 MemOps.push_back(Store); 3951 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3952 } 3953 3954 if (!MemOps.empty()) 3955 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3956 return FrameIndex; 3957 } 3958 3959 // Set up the stack frame that the va_list pointer will start from. 3960 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3961 const SDLoc &dl, SDValue &Chain, 3962 unsigned ArgOffset, 3963 unsigned TotalArgRegsSaveSize, 3964 bool ForceMutable) const { 3965 MachineFunction &MF = DAG.getMachineFunction(); 3966 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3967 3968 // Try to store any remaining integer argument regs 3969 // to their spots on the stack so that they may be loaded by dereferencing 3970 // the result of va_next. 3971 // If there are no regs to be stored, just point the address after the last 3972 // argument passed via the stack.
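// For example (illustrative): in a variadic function whose only fixed
// argument lives in r0, r1-r3 are left unallocated, so StoreByValRegs picks
// RBegin == r1 and REnd == r4, creates a 12-byte fixed object at offset -12
// (just below the incoming stack arguments), and stores r1-r3 there; that
// frame index is what va_start will hand out.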
3973 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3974 CCInfo.getInRegsParamsCount(), 3975 CCInfo.getNextStackOffset(), 3976 std::max(4U, TotalArgRegsSaveSize)); 3977 AFI->setVarArgsFrameIndex(FrameIndex); 3978 } 3979 3980 SDValue ARMTargetLowering::LowerFormalArguments( 3981 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3982 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3983 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3984 MachineFunction &MF = DAG.getMachineFunction(); 3985 MachineFrameInfo &MFI = MF.getFrameInfo(); 3986 3987 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3988 3989 // Assign locations to all of the incoming arguments. 3990 SmallVector<CCValAssign, 16> ArgLocs; 3991 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3992 *DAG.getContext()); 3993 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 3994 3995 SmallVector<SDValue, 16> ArgValues; 3996 SDValue ArgValue; 3997 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3998 unsigned CurArgIdx = 0; 3999 4000 // Initially ArgRegsSaveSize is zero. 4001 // Then we increase this value each time we meet byval parameter. 4002 // We also increase this value in case of varargs function. 4003 AFI->setArgRegsSaveSize(0); 4004 4005 // Calculate the amount of stack space that we need to allocate to store 4006 // byval and variadic arguments that are passed in registers. 4007 // We need to know this before we allocate the first byval or variadic 4008 // argument, as they will be allocated a stack slot below the CFA (Canonical 4009 // Frame Address, the stack pointer at entry to the function). 4010 unsigned ArgRegBegin = ARM::R4; 4011 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4012 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4013 break; 4014 4015 CCValAssign &VA = ArgLocs[i]; 4016 unsigned Index = VA.getValNo(); 4017 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4018 if (!Flags.isByVal()) 4019 continue; 4020 4021 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4022 unsigned RBegin, REnd; 4023 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4024 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4025 4026 CCInfo.nextInRegsParam(); 4027 } 4028 CCInfo.rewindByValRegsInfo(); 4029 4030 int lastInsIndex = -1; 4031 if (isVarArg && MFI.hasVAStart()) { 4032 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4033 if (RegIdx != array_lengthof(GPRArgRegs)) 4034 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4035 } 4036 4037 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4038 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4039 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4040 4041 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4042 CCValAssign &VA = ArgLocs[i]; 4043 if (Ins[VA.getValNo()].isOrigArg()) { 4044 std::advance(CurOrigArg, 4045 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4046 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4047 } 4048 // Arguments stored in registers. 4049 if (VA.isRegLoc()) { 4050 EVT RegVT = VA.getLocVT(); 4051 4052 if (VA.needsCustom()) { 4053 // f64 and vector types are split up into multiple registers or 4054 // combinations of registers and stack slots. 
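// For example (illustrative): with the soft-float calling convention an f64
// may arrive in a GPR pair (or a register plus a stack slot), and a v2f64
// needs two such pairs, so a single Ins[] entry can map to several ArgLocs
// entries that are reassembled below.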
4055 if (VA.getLocVT() == MVT::v2f64) { 4056 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 4057 Chain, DAG, dl); 4058 VA = ArgLocs[++i]; // skip ahead to next loc 4059 SDValue ArgValue2; 4060 if (VA.isMemLoc()) { 4061 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4062 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4063 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 4064 MachinePointerInfo::getFixedStack( 4065 DAG.getMachineFunction(), FI)); 4066 } else { 4067 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 4068 Chain, DAG, dl); 4069 } 4070 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4071 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4072 ArgValue, ArgValue1, 4073 DAG.getIntPtrConstant(0, dl)); 4074 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4075 ArgValue, ArgValue2, 4076 DAG.getIntPtrConstant(1, dl)); 4077 } else 4078 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4079 } else { 4080 const TargetRegisterClass *RC; 4081 4082 4083 if (RegVT == MVT::f16) 4084 RC = &ARM::HPRRegClass; 4085 else if (RegVT == MVT::f32) 4086 RC = &ARM::SPRRegClass; 4087 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 4088 RC = &ARM::DPRRegClass; 4089 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 4090 RC = &ARM::QPRRegClass; 4091 else if (RegVT == MVT::i32) 4092 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4093 : &ARM::GPRRegClass; 4094 else 4095 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4096 4097 // Transform the arguments in physical registers into virtual ones. 4098 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4099 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4100 4101 // If this value is passed in r0 and has the returned attribute (e.g. 4102 // C++ 'structors), record this fact for later use. 4103 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4104 AFI->setPreservesR0(); 4105 } 4106 } 4107 4108 // If this is an 8 or 16-bit value, it is really passed promoted 4109 // to 32 bits. Insert an assert[sz]ext to capture this, then 4110 // truncate to the right size. 4111 switch (VA.getLocInfo()) { 4112 default: llvm_unreachable("Unknown loc info!"); 4113 case CCValAssign::Full: break; 4114 case CCValAssign::BCvt: 4115 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4116 break; 4117 case CCValAssign::SExt: 4118 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4119 DAG.getValueType(VA.getValVT())); 4120 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4121 break; 4122 case CCValAssign::ZExt: 4123 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4124 DAG.getValueType(VA.getValVT())); 4125 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4126 break; 4127 } 4128 4129 InVals.push_back(ArgValue); 4130 } else { // VA.isRegLoc() 4131 // sanity check 4132 assert(VA.isMemLoc()); 4133 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4134 4135 int index = VA.getValNo(); 4136 4137 // Some Ins[] entries become multiple ArgLoc[] entries. 4138 // Process them only once. 4139 if (index != lastInsIndex) 4140 { 4141 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4142 // FIXME: For now, all byval parameter objects are marked mutable. 4143 // This can be changed with more analysis. 4144 // In case of tail call optimization mark all arguments mutable. 4145 // Since they could be overwritten by lowering of arguments in case of 4146 // a tail call. 
4147 if (Flags.isByVal()) { 4148 assert(Ins[index].isOrigArg() && 4149 "Byval arguments cannot be implicit"); 4150 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4151 4152 int FrameIndex = StoreByValRegs( 4153 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4154 VA.getLocMemOffset(), Flags.getByValSize()); 4155 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4156 CCInfo.nextInRegsParam(); 4157 } else { 4158 unsigned FIOffset = VA.getLocMemOffset(); 4159 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4160 FIOffset, true); 4161 4162 // Create load nodes to retrieve arguments from the stack. 4163 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4164 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4165 MachinePointerInfo::getFixedStack( 4166 DAG.getMachineFunction(), FI))); 4167 } 4168 lastInsIndex = index; 4169 } 4170 } 4171 } 4172 4173 // varargs 4174 if (isVarArg && MFI.hasVAStart()) 4175 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 4176 CCInfo.getNextStackOffset(), 4177 TotalArgRegsSaveSize); 4178 4179 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 4180 4181 return Chain; 4182 } 4183 4184 /// isFloatingPointZero - Return true if this is +0.0. 4185 static bool isFloatingPointZero(SDValue Op) { 4186 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4187 return CFP->getValueAPF().isPosZero(); 4188 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4189 // Maybe this has already been legalized into the constant pool? 4190 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4191 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4192 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4193 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4194 return CFP->getValueAPF().isPosZero(); 4195 } 4196 } else if (Op->getOpcode() == ISD::BITCAST && 4197 Op->getValueType(0) == MVT::f64) { 4198 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4199 // created by LowerConstantFP(). 4200 SDValue BitcastOp = Op->getOperand(0); 4201 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4202 isNullConstant(BitcastOp->getOperand(0))) 4203 return true; 4204 } 4205 return false; 4206 } 4207 4208 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4209 /// the given operands. 4210 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4211 SDValue &ARMcc, SelectionDAG &DAG, 4212 const SDLoc &dl) const { 4213 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4214 unsigned C = RHSC->getZExtValue(); 4215 if (!isLegalICmpImmediate((int32_t)C)) { 4216 // Constant does not fit, try adjusting it by one. 4217 switch (CC) { 4218 default: break; 4219 case ISD::SETLT: 4220 case ISD::SETGE: 4221 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4222 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4223 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4224 } 4225 break; 4226 case ISD::SETULT: 4227 case ISD::SETUGE: 4228 if (C != 0 && isLegalICmpImmediate(C-1)) { 4229 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 4230 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4231 } 4232 break; 4233 case ISD::SETLE: 4234 case ISD::SETGT: 4235 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4236 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 4237 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4238 } 4239 break; 4240 case ISD::SETULE: 4241 case ISD::SETUGT: 4242 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4243 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4244 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4245 } 4246 break; 4247 } 4248 } 4249 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4250 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4251 // In ARM and Thumb-2, the compare instructions can shift their second 4252 // operand. 4253 CC = ISD::getSetCCSwappedOperands(CC); 4254 std::swap(LHS, RHS); 4255 } 4256 4257 // Thumb1 has very limited immediate modes, so turning an "and" into a 4258 // shift can save multiple instructions. 4259 // 4260 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4261 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4262 // own. If it's the operand to an unsigned comparison with an immediate, 4263 // we can eliminate one of the shifts: we transform 4264 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4265 // 4266 // We avoid transforming cases which aren't profitable due to encoding 4267 // details: 4268 // 4269 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4270 // would not; in that case, we're essentially trading one immediate load for 4271 // another. 4272 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4273 // 3. C2 is zero; we have other code for this special case. 4274 // 4275 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4276 // instruction, since the AND is always one instruction anyway, but we could 4277 // use narrow instructions in some cases. 4278 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4279 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4280 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4281 !isSignedIntSetCC(CC)) { 4282 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4283 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4284 uint64_t RHSV = RHSC->getZExtValue(); 4285 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4286 unsigned ShiftBits = countLeadingZeros(Mask); 4287 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4288 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4289 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4290 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4291 } 4292 } 4293 } 4294 4295 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4296 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4297 // way a cmp would. 4298 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4299 // some tweaks to the heuristics for the previous and->shift transform. 4300 // FIXME: Optimize cases where the LHS isn't a shift. 
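// Illustrative example of the transform below (r0 is just a placeholder):
// with c == 2, "(x << 2) u> 0x80000000" holds exactly when bit 31 of
// (x << 2) is set and at least one lower bit is also set. "lsls r0, r0, #3"
// shifts that bit 31 into the carry flag and leaves Z clear only if a lower
// bit survived, so branching on HI (C set and Z clear) reproduces the
// original compare.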
4301 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4302 isa<ConstantSDNode>(RHS) && 4303 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4304 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4305 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4306 unsigned ShiftAmt = 4307 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4308 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4309 DAG.getVTList(MVT::i32, MVT::i32), 4310 LHS.getOperand(0), 4311 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4312 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4313 Shift.getValue(1), SDValue()); 4314 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4315 return Chain.getValue(1); 4316 } 4317 4318 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4319 4320 // If the RHS is a constant zero then the V (overflow) flag will never be 4321 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4322 // simpler for other passes (like the peephole optimiser) to deal with. 4323 if (isNullConstant(RHS)) { 4324 switch (CondCode) { 4325 default: break; 4326 case ARMCC::GE: 4327 CondCode = ARMCC::PL; 4328 break; 4329 case ARMCC::LT: 4330 CondCode = ARMCC::MI; 4331 break; 4332 } 4333 } 4334 4335 ARMISD::NodeType CompareType; 4336 switch (CondCode) { 4337 default: 4338 CompareType = ARMISD::CMP; 4339 break; 4340 case ARMCC::EQ: 4341 case ARMCC::NE: 4342 // Uses only Z Flag 4343 CompareType = ARMISD::CMPZ; 4344 break; 4345 } 4346 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4347 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4348 } 4349 4350 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 4351 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4352 SelectionDAG &DAG, const SDLoc &dl) const { 4353 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4354 SDValue Cmp; 4355 if (!isFloatingPointZero(RHS)) 4356 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); 4357 else 4358 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); 4359 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4360 } 4361 4362 /// duplicateCmp - Glue values can have only one use, so this function 4363 /// duplicates a comparison node. 4364 SDValue 4365 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4366 unsigned Opc = Cmp.getOpcode(); 4367 SDLoc DL(Cmp); 4368 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4369 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4370 4371 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4372 Cmp = Cmp.getOperand(0); 4373 Opc = Cmp.getOpcode(); 4374 if (Opc == ARMISD::CMPFP) 4375 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4376 else { 4377 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4378 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4379 } 4380 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4381 } 4382 4383 // This function returns three things: the arithmetic computation itself 4384 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4385 // comparison and the condition code define the case in which the arithmetic 4386 // computation *does not* overflow. 
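// For example, an i32 ISD::SADDO with operands a and b produces
// Value = (a + b), OverflowCmp = CMP(a + b, a) and ARMcc = VC: the
// subtraction (a + b) - a recomputes b, and its V flag is set precisely when
// the original addition overflowed, so checking VC ("overflow clear")
// identifies the no-overflow case.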
4387 std::pair<SDValue, SDValue> 4388 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4389 SDValue &ARMcc) const { 4390 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4391 4392 SDValue Value, OverflowCmp; 4393 SDValue LHS = Op.getOperand(0); 4394 SDValue RHS = Op.getOperand(1); 4395 SDLoc dl(Op); 4396 4397 // FIXME: We are currently always generating CMPs because we don't support 4398 // generating CMN through the backend. This is not as good as the natural 4399 // CMP case because it causes a register dependency and cannot be folded 4400 // later. 4401 4402 switch (Op.getOpcode()) { 4403 default: 4404 llvm_unreachable("Unknown overflow instruction!"); 4405 case ISD::SADDO: 4406 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4407 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4408 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4409 break; 4410 case ISD::UADDO: 4411 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4412 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4413 // We do not use it in the USUBO case as Value may not be used. 4414 Value = DAG.getNode(ARMISD::ADDC, dl, 4415 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4416 .getValue(0); 4417 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4418 break; 4419 case ISD::SSUBO: 4420 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4421 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4422 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4423 break; 4424 case ISD::USUBO: 4425 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4426 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4427 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4428 break; 4429 case ISD::UMULO: 4430 // We generate a UMUL_LOHI and then check if the high word is 0. 4431 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4432 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4433 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4434 LHS, RHS); 4435 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4436 DAG.getConstant(0, dl, MVT::i32)); 4437 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4438 break; 4439 case ISD::SMULO: 4440 // We generate a SMUL_LOHI and then check if all the bits of the high word 4441 // are the same as the sign bit of the low word. 4442 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4443 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4444 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4445 LHS, RHS); 4446 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4447 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4448 Value.getValue(0), 4449 DAG.getConstant(31, dl, MVT::i32))); 4450 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4451 break; 4452 } // switch (...) 4453 4454 return std::make_pair(Value, OverflowCmp); 4455 } 4456 4457 SDValue 4458 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4459 // Let legalize expand this if it isn't a legal type yet. 4460 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4461 return SDValue(); 4462 4463 SDValue Value, OverflowCmp; 4464 SDValue ARMcc; 4465 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4466 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4467 SDLoc dl(Op); 4468 // We use 0 and 1 as false and true values. 
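// Note that ARMcc, as returned by getARMXALUOOp, encodes the condition under
// which the operation does *not* overflow, so the CMOV below produces
// FVal (0) on the no-overflow path and TVal (1) when overflow occurs.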
4469 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4470 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4471 EVT VT = Op.getValueType(); 4472 4473 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4474 ARMcc, CCR, OverflowCmp); 4475 4476 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4477 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4478 } 4479 4480 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4481 SelectionDAG &DAG) { 4482 SDLoc DL(BoolCarry); 4483 EVT CarryVT = BoolCarry.getValueType(); 4484 4485 // This converts the boolean value carry into the carry flag by doing 4486 // ARMISD::SUBC Carry, 1 4487 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4488 DAG.getVTList(CarryVT, MVT::i32), 4489 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4490 return Carry.getValue(1); 4491 } 4492 4493 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4494 SelectionDAG &DAG) { 4495 SDLoc DL(Flags); 4496 4497 // Now convert the carry flag into a boolean carry. We do this 4498 // using ARMISD:ADDE 0, 0, Carry 4499 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4500 DAG.getConstant(0, DL, MVT::i32), 4501 DAG.getConstant(0, DL, MVT::i32), Flags); 4502 } 4503 4504 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4505 SelectionDAG &DAG) const { 4506 // Let legalize expand this if it isn't a legal type yet. 4507 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4508 return SDValue(); 4509 4510 SDValue LHS = Op.getOperand(0); 4511 SDValue RHS = Op.getOperand(1); 4512 SDLoc dl(Op); 4513 4514 EVT VT = Op.getValueType(); 4515 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4516 SDValue Value; 4517 SDValue Overflow; 4518 switch (Op.getOpcode()) { 4519 default: 4520 llvm_unreachable("Unknown overflow instruction!"); 4521 case ISD::UADDO: 4522 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4523 // Convert the carry flag into a boolean value. 4524 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4525 break; 4526 case ISD::USUBO: { 4527 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4528 // Convert the carry flag into a boolean value. 4529 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4530 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4531 // value. So compute 1 - C. 4532 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4533 DAG.getConstant(1, dl, MVT::i32), Overflow); 4534 break; 4535 } 4536 } 4537 4538 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4539 } 4540 4541 static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, 4542 const ARMSubtarget *Subtarget) { 4543 EVT VT = Op.getValueType(); 4544 if (!Subtarget->hasDSP()) 4545 return SDValue(); 4546 if (!VT.isSimple()) 4547 return SDValue(); 4548 4549 unsigned NewOpcode; 4550 bool IsAdd = Op->getOpcode() == ISD::SADDSAT; 4551 switch (VT.getSimpleVT().SimpleTy) { 4552 default: 4553 return SDValue(); 4554 case MVT::i8: 4555 NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; 4556 break; 4557 case MVT::i16: 4558 NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b; 4559 break; 4560 } 4561 4562 SDLoc dl(Op); 4563 SDValue Add = 4564 DAG.getNode(NewOpcode, dl, MVT::i32, 4565 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 4566 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 4567 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 4568 } 4569 4570 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4571 SDValue Cond = Op.getOperand(0); 4572 SDValue SelectTrue = Op.getOperand(1); 4573 SDValue SelectFalse = Op.getOperand(2); 4574 SDLoc dl(Op); 4575 unsigned Opc = Cond.getOpcode(); 4576 4577 if (Cond.getResNo() == 1 && 4578 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4579 Opc == ISD::USUBO)) { 4580 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4581 return SDValue(); 4582 4583 SDValue Value, OverflowCmp; 4584 SDValue ARMcc; 4585 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4586 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4587 EVT VT = Op.getValueType(); 4588 4589 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4590 OverflowCmp, DAG); 4591 } 4592 4593 // Convert: 4594 // 4595 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4596 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4597 // 4598 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4599 const ConstantSDNode *CMOVTrue = 4600 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4601 const ConstantSDNode *CMOVFalse = 4602 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4603 4604 if (CMOVTrue && CMOVFalse) { 4605 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4606 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4607 4608 SDValue True; 4609 SDValue False; 4610 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4611 True = SelectTrue; 4612 False = SelectFalse; 4613 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4614 True = SelectFalse; 4615 False = SelectTrue; 4616 } 4617 4618 if (True.getNode() && False.getNode()) { 4619 EVT VT = Op.getValueType(); 4620 SDValue ARMcc = Cond.getOperand(2); 4621 SDValue CCR = Cond.getOperand(3); 4622 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4623 assert(True.getValueType() == VT); 4624 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4625 } 4626 } 4627 } 4628 4629 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4630 // undefined bits before doing a full-word comparison with zero. 4631 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4632 DAG.getConstant(1, dl, Cond.getValueType())); 4633 4634 return DAG.getSelectCC(dl, Cond, 4635 DAG.getConstant(0, dl, Cond.getValueType()), 4636 SelectTrue, SelectFalse, ISD::SETNE); 4637 } 4638 4639 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4640 bool &swpCmpOps, bool &swpVselOps) { 4641 // Start by selecting the GE condition code for opcodes that return true for 4642 // 'equality' 4643 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4644 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4645 CondCode = ARMCC::GE; 4646 4647 // and GT for opcodes that return false for 'equality'. 4648 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4649 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4650 CondCode = ARMCC::GT; 4651 4652 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4653 // to swap the compare operands. 
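// For example, ISD::SETOLT ("a < b") has already been mapped to GT above;
// swapping the compare operands below turns the test into "b > a", which the
// VSEL instruction can select on directly.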
4654 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4655 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4656 swpCmpOps = true; 4657 4658 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4659 // If we have an unordered opcode, we need to swap the operands to the VSEL 4660 // instruction (effectively negating the condition). 4661 // 4662 // This also has the effect of swapping which one of 'less' or 'greater' 4663 // returns true, so we also swap the compare operands. It also switches 4664 // whether we return true for 'equality', so we compensate by picking the 4665 // opposite condition code to our original choice. 4666 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4667 CC == ISD::SETUGT) { 4668 swpCmpOps = !swpCmpOps; 4669 swpVselOps = !swpVselOps; 4670 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4671 } 4672 4673 // 'ordered' is 'anything but unordered', so use the VS condition code and 4674 // swap the VSEL operands. 4675 if (CC == ISD::SETO) { 4676 CondCode = ARMCC::VS; 4677 swpVselOps = true; 4678 } 4679 4680 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4681 // code and swap the VSEL operands. Also do this if we don't care about the 4682 // unordered case. 4683 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4684 CondCode = ARMCC::EQ; 4685 swpVselOps = true; 4686 } 4687 } 4688 4689 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4690 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4691 SDValue Cmp, SelectionDAG &DAG) const { 4692 if (!Subtarget->hasFP64() && VT == MVT::f64) { 4693 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4694 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4695 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4696 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4697 4698 SDValue TrueLow = TrueVal.getValue(0); 4699 SDValue TrueHigh = TrueVal.getValue(1); 4700 SDValue FalseLow = FalseVal.getValue(0); 4701 SDValue FalseHigh = FalseVal.getValue(1); 4702 4703 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4704 ARMcc, CCR, Cmp); 4705 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4706 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4707 4708 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4709 } else { 4710 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4711 Cmp); 4712 } 4713 } 4714 4715 static bool isGTorGE(ISD::CondCode CC) { 4716 return CC == ISD::SETGT || CC == ISD::SETGE; 4717 } 4718 4719 static bool isLTorLE(ISD::CondCode CC) { 4720 return CC == ISD::SETLT || CC == ISD::SETLE; 4721 } 4722 4723 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4724 // All of these conditions (and their <= and >= counterparts) will do: 4725 // x < k ? k : x 4726 // x > k ? x : k 4727 // k < x ? x : k 4728 // k > x ? k : x 4729 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4730 const SDValue TrueVal, const SDValue FalseVal, 4731 const ISD::CondCode CC, const SDValue K) { 4732 return (isGTorGE(CC) && 4733 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4734 (isLTorLE(CC) && 4735 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4736 } 4737 4738 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
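// All of these conditions (and their <= and >= counterparts) will do:
//   x > k ? k : x
//   x < k ? x : k
//   k > x ? x : k
//   k < x ? k : x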
4739 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4740 const SDValue TrueVal, const SDValue FalseVal, 4741 const ISD::CondCode CC, const SDValue K) { 4742 return (isGTorGE(CC) && 4743 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4744 (isLTorLE(CC) && 4745 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4746 } 4747 4748 // Check if two chained conditionals could be converted into SSAT or USAT. 4749 // 4750 // SSAT can replace a set of two conditional selectors that bound a number to an 4751 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4752 // 4753 // x < -k ? -k : (x > k ? k : x) 4754 // x < -k ? -k : (x < k ? x : k) 4755 // x > -k ? (x > k ? k : x) : -k 4756 // x < k ? (x < -k ? -k : x) : k 4757 // etc. 4758 // 4759 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is 4760 // a power of 2. 4761 // 4762 // It returns true if the conversion can be done, false otherwise. 4763 // Additionally, the variable is returned in parameter V, the constant in K and 4764 // usat is set to true if the conditional represents an unsigned saturation 4765 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4766 uint64_t &K, bool &usat) { 4767 SDValue LHS1 = Op.getOperand(0); 4768 SDValue RHS1 = Op.getOperand(1); 4769 SDValue TrueVal1 = Op.getOperand(2); 4770 SDValue FalseVal1 = Op.getOperand(3); 4771 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4772 4773 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4774 if (Op2.getOpcode() != ISD::SELECT_CC) 4775 return false; 4776 4777 SDValue LHS2 = Op2.getOperand(0); 4778 SDValue RHS2 = Op2.getOperand(1); 4779 SDValue TrueVal2 = Op2.getOperand(2); 4780 SDValue FalseVal2 = Op2.getOperand(3); 4781 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4782 4783 // Find out which are the constants and which are the variables 4784 // in each conditional 4785 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4786 ? &RHS1 4787 : nullptr; 4788 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4789 ? &RHS2 4790 : nullptr; 4791 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4792 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4793 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4794 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4795 4796 // We must detect cases where the original operations worked with 16- or 4797 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 4798 // must work with sign-extended values but the select operations return 4799 // the original non-extended value. 4800 SDValue V2TmpReg = V2Tmp; 4801 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4802 V2TmpReg = V2Tmp->getOperand(0); 4803 4804 // Check that the registers and the constants have the correct values 4805 // in both conditionals 4806 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4807 V2TmpReg != V2) 4808 return false; 4809 4810 // Figure out which conditional is saturating the lower/upper bound. 4811 const SDValue *LowerCheckOp = 4812 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4813 ? &Op 4814 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4815 ? &Op2 4816 : nullptr; 4817 const SDValue *UpperCheckOp = 4818 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4819 ? 
&Op 4820 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4821 ? &Op2 4822 : nullptr; 4823 4824 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4825 return false; 4826 4827 // Check that the constant in the lower-bound check is 4828 // the opposite of the constant in the upper-bound check 4829 // in 1's complement. 4830 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4831 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4832 int64_t PosVal = std::max(Val1, Val2); 4833 int64_t NegVal = std::min(Val1, Val2); 4834 4835 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4836 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4837 isPowerOf2_64(PosVal + 1)) { 4838 4839 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4840 if (Val1 == ~Val2) 4841 usat = false; 4842 else if (NegVal == 0) 4843 usat = true; 4844 else 4845 return false; 4846 4847 V = V2; 4848 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4849 4850 return true; 4851 } 4852 4853 return false; 4854 } 4855 4856 // Check if a condition of the type x < k ? k : x can be converted into a 4857 // bit operation instead of conditional moves. 4858 // Currently this is allowed given: 4859 // - The conditions and values match up 4860 // - k is 0 or -1 (all ones) 4861 // This function will not check the last condition, thats up to the caller 4862 // It returns true if the transformation can be made, and in such case 4863 // returns x in V, and k in SatK. 4864 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 4865 SDValue &SatK) 4866 { 4867 SDValue LHS = Op.getOperand(0); 4868 SDValue RHS = Op.getOperand(1); 4869 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4870 SDValue TrueVal = Op.getOperand(2); 4871 SDValue FalseVal = Op.getOperand(3); 4872 4873 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 4874 ? &RHS 4875 : nullptr; 4876 4877 // No constant operation in comparison, early out 4878 if (!K) 4879 return false; 4880 4881 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 4882 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 4883 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 4884 4885 // If the constant on left and right side, or variable on left and right, 4886 // does not match, early out 4887 if (*K != KTmp || V != VTmp) 4888 return false; 4889 4890 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 4891 SatK = *K; 4892 return true; 4893 } 4894 4895 return false; 4896 } 4897 4898 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 4899 if (VT == MVT::f32) 4900 return !Subtarget->hasVFP2Base(); 4901 if (VT == MVT::f64) 4902 return !Subtarget->hasFP64(); 4903 if (VT == MVT::f16) 4904 return !Subtarget->hasFullFP16(); 4905 return false; 4906 } 4907 4908 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4909 EVT VT = Op.getValueType(); 4910 SDLoc dl(Op); 4911 4912 // Try to convert two saturating conditional selects into a single SSAT 4913 SDValue SatValue; 4914 uint64_t SatConstant; 4915 bool SatUSat; 4916 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4917 isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { 4918 if (SatUSat) 4919 return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, 4920 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4921 else 4922 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4923 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4924 } 4925 4926 // Try to convert expressions of the form x < k ? k : x (and similar forms) 4927 // into more efficient bit operations, which is possible when k is 0 or -1 4928 // On ARM and Thumb-2 which have flexible operand 2 this will result in 4929 // single instructions. On Thumb the shift and the bit operation will be two 4930 // instructions. 4931 // Only allow this transformation on full-width (32-bit) operations 4932 SDValue LowerSatConstant; 4933 if (VT == MVT::i32 && 4934 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 4935 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 4936 DAG.getConstant(31, dl, VT)); 4937 if (isNullConstant(LowerSatConstant)) { 4938 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 4939 DAG.getAllOnesConstant(dl, VT)); 4940 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 4941 } else if (isAllOnesConstant(LowerSatConstant)) 4942 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 4943 } 4944 4945 SDValue LHS = Op.getOperand(0); 4946 SDValue RHS = Op.getOperand(1); 4947 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4948 SDValue TrueVal = Op.getOperand(2); 4949 SDValue FalseVal = Op.getOperand(3); 4950 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 4951 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 4952 4953 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 4954 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 4955 unsigned TVal = CTVal->getZExtValue(); 4956 unsigned FVal = CFVal->getZExtValue(); 4957 unsigned Opcode = 0; 4958 4959 if (TVal == ~FVal) { 4960 Opcode = ARMISD::CSINV; 4961 } else if (TVal == ~FVal + 1) { 4962 Opcode = ARMISD::CSNEG; 4963 } else if (TVal + 1 == FVal) { 4964 Opcode = ARMISD::CSINC; 4965 } else if (TVal == FVal + 1) { 4966 Opcode = ARMISD::CSINC; 4967 std::swap(TrueVal, FalseVal); 4968 std::swap(TVal, FVal); 4969 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 4970 } 4971 4972 if (Opcode) { 4973 // If one of the constants is cheaper than another, materialise the 4974 // cheaper one and let the csel generate the other. 
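// Illustrative examples (assuming the materialization-cost query below does
// not force an extra swap): (c ? 1 : 0) takes the CSINC path and, after the
// ZR handling further down, ends up as CSINC(zr, zr, !c), producing 1 by
// incrementing zr when c holds; (c ? -1 : 0) similarly ends up as
// CSINV(zr, zr, !c).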
4975 if (Opcode != ARMISD::CSINC &&
4976 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
4977 std::swap(TrueVal, FalseVal);
4978 std::swap(TVal, FVal);
4979 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4980 }
4981
4982 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
4983 // condition to get there. CSINC is not invertible like the other two
4984 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
4985 if (FVal == 0 && Opcode != ARMISD::CSINC) {
4986 std::swap(TrueVal, FalseVal);
4987 std::swap(TVal, FVal);
4988 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4989 }
4990 if (TVal == 0)
4991 TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
4992
4993 // Drop FVal's value because we can get it by inverting/negating TVal.
4994 FalseVal = TrueVal;
4995
4996 SDValue ARMcc;
4997 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4998 EVT VT = TrueVal.getValueType();
4999 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5000 }
5001 }
5002
5003 if (isUnsupportedFloatingType(LHS.getValueType())) {
5004 DAG.getTargetLoweringInfo().softenSetCCOperands(
5005 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5006
5007 // If softenSetCCOperands only returned one value, we should compare it to
5008 // zero.
5009 if (!RHS.getNode()) {
5010 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5011 CC = ISD::SETNE;
5012 }
5013 }
5014
5015 if (LHS.getValueType() == MVT::i32) {
5016 // Try to generate VSEL on ARMv8.
5017 // The VSEL instruction can't use all the usual ARM condition
5018 // codes: it only has two bits to select the condition code, so it's
5019 // constrained to use only GE, GT, VS and EQ.
5020 //
5021 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5022 // swap the operands of the previous compare instruction (effectively
5023 // inverting the compare condition, swapping 'less' and 'greater') and
5024 // sometimes need to swap the operands to the VSEL (which inverts the
5025 // condition in the sense of firing whenever the previous condition didn't)
5026 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5027 TrueVal.getValueType() == MVT::f32 ||
5028 TrueVal.getValueType() == MVT::f64)) {
5029 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5030 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5031 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5032 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5033 std::swap(TrueVal, FalseVal);
5034 }
5035 }
5036
5037 SDValue ARMcc;
5038 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5039 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5040 // Choose GE over PL, which vsel does not support.
5041 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5042 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5043 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5044 }
5045
5046 ARMCC::CondCodes CondCode, CondCode2;
5047 FPCCToARMCC(CC, CondCode, CondCode2);
5048
5049 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5050 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5051 // must use VSEL (limited condition codes), due to not having conditional f16
5052 // moves.
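// For example, "x == 0.0f ? a : b" keeps the +0.0 on the RHS so getVFPCmp
// can use CMPFPw0 (a VCMP against #0.0) instead of materialising a zero and
// using CMPFP; for f16 operands we give that preference up because the
// select below must be a VSEL, which only encodes GE, GT, EQ and VS.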
5053 if (Subtarget->hasFPARMv8Base() && 5054 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5055 (TrueVal.getValueType() == MVT::f16 || 5056 TrueVal.getValueType() == MVT::f32 || 5057 TrueVal.getValueType() == MVT::f64)) { 5058 bool swpCmpOps = false; 5059 bool swpVselOps = false; 5060 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5061 5062 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5063 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5064 if (swpCmpOps) 5065 std::swap(LHS, RHS); 5066 if (swpVselOps) 5067 std::swap(TrueVal, FalseVal); 5068 } 5069 } 5070 5071 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5072 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5073 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5074 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5075 if (CondCode2 != ARMCC::AL) { 5076 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5077 // FIXME: Needs another CMP because flag can have but one use. 5078 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5079 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5080 } 5081 return Result; 5082 } 5083 5084 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5085 /// to morph to an integer compare sequence. 5086 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5087 const ARMSubtarget *Subtarget) { 5088 SDNode *N = Op.getNode(); 5089 if (!N->hasOneUse()) 5090 // Otherwise it requires moving the value from fp to integer registers. 5091 return false; 5092 if (!N->getNumValues()) 5093 return false; 5094 EVT VT = Op.getValueType(); 5095 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5096 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5097 // vmrs are very slow, e.g. cortex-a8. 5098 return false; 5099 5100 if (isFloatingPointZero(Op)) { 5101 SeenZero = true; 5102 return true; 5103 } 5104 return ISD::isNormalLoad(N); 5105 } 5106 5107 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5108 if (isFloatingPointZero(Op)) 5109 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5110 5111 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5112 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5113 Ld->getPointerInfo(), Ld->getAlignment(), 5114 Ld->getMemOperand()->getFlags()); 5115 5116 llvm_unreachable("Unknown VFP cmp argument!"); 5117 } 5118 5119 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5120 SDValue &RetVal1, SDValue &RetVal2) { 5121 SDLoc dl(Op); 5122 5123 if (isFloatingPointZero(Op)) { 5124 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5125 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5126 return; 5127 } 5128 5129 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5130 SDValue Ptr = Ld->getBasePtr(); 5131 RetVal1 = 5132 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5133 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5134 5135 EVT PtrType = Ptr.getValueType(); 5136 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5137 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5138 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5139 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5140 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5141 Ld->getMemOperand()->getFlags()); 5142 return; 5143 } 5144 5145 llvm_unreachable("Unknown VFP cmp argument!"); 5146 } 5147 5148 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5149 /// f32 and even f64 comparisons to integer ones. 
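// For example, with unsafe-fp-math "if (x != 0.0f)" where x is a plain load
// can be rewritten as an integer test: x is re-loaded as an i32 and the 0.0
// folds to the constant 0, the sign bits are masked off (so -0.0 still
// compares equal to +0.0), and an ordinary CMP + BRCOND replaces the
// VCMP + VMRS sequence.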
5150 SDValue 5151 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5152 SDValue Chain = Op.getOperand(0); 5153 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5154 SDValue LHS = Op.getOperand(2); 5155 SDValue RHS = Op.getOperand(3); 5156 SDValue Dest = Op.getOperand(4); 5157 SDLoc dl(Op); 5158 5159 bool LHSSeenZero = false; 5160 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5161 bool RHSSeenZero = false; 5162 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5163 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5164 // If unsafe fp math optimization is enabled and there are no other uses of 5165 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5166 // to an integer comparison. 5167 if (CC == ISD::SETOEQ) 5168 CC = ISD::SETEQ; 5169 else if (CC == ISD::SETUNE) 5170 CC = ISD::SETNE; 5171 5172 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5173 SDValue ARMcc; 5174 if (LHS.getValueType() == MVT::f32) { 5175 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5176 bitcastf32Toi32(LHS, DAG), Mask); 5177 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5178 bitcastf32Toi32(RHS, DAG), Mask); 5179 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5180 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5181 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5182 Chain, Dest, ARMcc, CCR, Cmp); 5183 } 5184 5185 SDValue LHS1, LHS2; 5186 SDValue RHS1, RHS2; 5187 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5188 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5189 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5190 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5191 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5192 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5193 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5194 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5195 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5196 } 5197 5198 return SDValue(); 5199 } 5200 5201 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5202 SDValue Chain = Op.getOperand(0); 5203 SDValue Cond = Op.getOperand(1); 5204 SDValue Dest = Op.getOperand(2); 5205 SDLoc dl(Op); 5206 5207 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5208 // instruction. 5209 unsigned Opc = Cond.getOpcode(); 5210 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5211 !Subtarget->isThumb1Only(); 5212 if (Cond.getResNo() == 1 && 5213 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5214 Opc == ISD::USUBO || OptimizeMul)) { 5215 // Only lower legal XALUO ops. 5216 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5217 return SDValue(); 5218 5219 // The actual operation with overflow check. 5220 SDValue Value, OverflowCmp; 5221 SDValue ARMcc; 5222 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5223 5224 // Reverse the condition code. 
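// getARMXALUOOp returns ARMcc as the condition under which the arithmetic
// does *not* overflow, but this branch must be taken when the overflow bit
// is set, hence the inversion (e.g. VC becomes VS for a saddo feeding a br).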
5225 ARMCC::CondCodes CondCode = 5226 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5227 CondCode = ARMCC::getOppositeCondition(CondCode); 5228 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5229 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5230 5231 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5232 OverflowCmp); 5233 } 5234 5235 return SDValue(); 5236 } 5237 5238 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5239 SDValue Chain = Op.getOperand(0); 5240 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5241 SDValue LHS = Op.getOperand(2); 5242 SDValue RHS = Op.getOperand(3); 5243 SDValue Dest = Op.getOperand(4); 5244 SDLoc dl(Op); 5245 5246 if (isUnsupportedFloatingType(LHS.getValueType())) { 5247 DAG.getTargetLoweringInfo().softenSetCCOperands( 5248 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5249 5250 // If softenSetCCOperands only returned one value, we should compare it to 5251 // zero. 5252 if (!RHS.getNode()) { 5253 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5254 CC = ISD::SETNE; 5255 } 5256 } 5257 5258 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5259 // instruction. 5260 unsigned Opc = LHS.getOpcode(); 5261 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5262 !Subtarget->isThumb1Only(); 5263 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5264 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5265 Opc == ISD::USUBO || OptimizeMul) && 5266 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5267 // Only lower legal XALUO ops. 5268 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5269 return SDValue(); 5270 5271 // The actual operation with overflow check. 5272 SDValue Value, OverflowCmp; 5273 SDValue ARMcc; 5274 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5275 5276 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5277 // Reverse the condition code. 
5278 ARMCC::CondCodes CondCode = 5279 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5280 CondCode = ARMCC::getOppositeCondition(CondCode); 5281 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5282 } 5283 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5284 5285 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5286 OverflowCmp); 5287 } 5288 5289 if (LHS.getValueType() == MVT::i32) { 5290 SDValue ARMcc; 5291 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5292 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5293 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5294 Chain, Dest, ARMcc, CCR, Cmp); 5295 } 5296 5297 if (getTargetMachine().Options.UnsafeFPMath && 5298 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5299 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5300 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5301 return Result; 5302 } 5303 5304 ARMCC::CondCodes CondCode, CondCode2; 5305 FPCCToARMCC(CC, CondCode, CondCode2); 5306 5307 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5308 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5309 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5310 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5311 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5312 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5313 if (CondCode2 != ARMCC::AL) { 5314 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5315 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5316 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5317 } 5318 return Res; 5319 } 5320 5321 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5322 SDValue Chain = Op.getOperand(0); 5323 SDValue Table = Op.getOperand(1); 5324 SDValue Index = Op.getOperand(2); 5325 SDLoc dl(Op); 5326 5327 EVT PTy = getPointerTy(DAG.getDataLayout()); 5328 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5329 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5330 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5331 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5332 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5333 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5334 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5335 // which does another jump to the destination. This also makes it easier 5336 // to translate it to TBB / TBH later (Thumb2 only). 5337 // FIXME: This might not work if the function is extremely large. 
5338 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5339 Addr, Op.getOperand(2), JTI); 5340 } 5341 if (isPositionIndependent() || Subtarget->isROPI()) { 5342 Addr = 5343 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5344 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5345 Chain = Addr.getValue(1); 5346 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5347 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5348 } else { 5349 Addr = 5350 DAG.getLoad(PTy, dl, Chain, Addr, 5351 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5352 Chain = Addr.getValue(1); 5353 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5354 } 5355 } 5356 5357 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5358 EVT VT = Op.getValueType(); 5359 SDLoc dl(Op); 5360 5361 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5362 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5363 return Op; 5364 return DAG.UnrollVectorOp(Op.getNode()); 5365 } 5366 5367 const bool HasFullFP16 = 5368 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5369 5370 EVT NewTy; 5371 const EVT OpTy = Op.getOperand(0).getValueType(); 5372 if (OpTy == MVT::v4f32) 5373 NewTy = MVT::v4i32; 5374 else if (OpTy == MVT::v4f16 && HasFullFP16) 5375 NewTy = MVT::v4i16; 5376 else if (OpTy == MVT::v8f16 && HasFullFP16) 5377 NewTy = MVT::v8i16; 5378 else 5379 llvm_unreachable("Invalid type for custom lowering!"); 5380 5381 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5382 return DAG.UnrollVectorOp(Op.getNode()); 5383 5384 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5385 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5386 } 5387 5388 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5389 EVT VT = Op.getValueType(); 5390 if (VT.isVector()) 5391 return LowerVectorFP_TO_INT(Op, DAG); 5392 5393 bool IsStrict = Op->isStrictFPOpcode(); 5394 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5395 5396 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5397 RTLIB::Libcall LC; 5398 if (Op.getOpcode() == ISD::FP_TO_SINT || 5399 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5400 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5401 Op.getValueType()); 5402 else 5403 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5404 Op.getValueType()); 5405 SDLoc Loc(Op); 5406 MakeLibCallOptions CallOptions; 5407 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5408 SDValue Result; 5409 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5410 CallOptions, Loc, Chain); 5411 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 5412 } 5413 5414 // FIXME: Remove this when we have strict fp instruction selection patterns 5415 if (IsStrict) { 5416 DAG.mutateStrictFPToFP(Op.getNode()); 5417 } 5418 5419 return Op; 5420 } 5421 5422 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5423 EVT VT = Op.getValueType(); 5424 SDLoc dl(Op); 5425 5426 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5427 if (VT.getVectorElementType() == MVT::f32) 5428 return Op; 5429 return DAG.UnrollVectorOp(Op.getNode()); 5430 } 5431 5432 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5433 Op.getOperand(0).getValueType() == MVT::v8i16) && 5434 "Invalid type for custom lowering!"); 5435 5436 const bool HasFullFP16 = 5437 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5438 5439 EVT DestVecType; 5440 if (VT == MVT::v4f32) 5441 DestVecType = MVT::v4i32; 5442 else if (VT == MVT::v4f16 && HasFullFP16) 5443 DestVecType = MVT::v4i16; 5444 else if (VT == MVT::v8f16 && HasFullFP16) 5445 DestVecType = MVT::v8i16; 5446 else 5447 return DAG.UnrollVectorOp(Op.getNode()); 5448 5449 unsigned CastOpc; 5450 unsigned Opc; 5451 switch (Op.getOpcode()) { 5452 default: llvm_unreachable("Invalid opcode!"); 5453 case ISD::SINT_TO_FP: 5454 CastOpc = ISD::SIGN_EXTEND; 5455 Opc = ISD::SINT_TO_FP; 5456 break; 5457 case ISD::UINT_TO_FP: 5458 CastOpc = ISD::ZERO_EXTEND; 5459 Opc = ISD::UINT_TO_FP; 5460 break; 5461 } 5462 5463 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5464 return DAG.getNode(Opc, dl, VT, Op); 5465 } 5466 5467 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5468 EVT VT = Op.getValueType(); 5469 if (VT.isVector()) 5470 return LowerVectorINT_TO_FP(Op, DAG); 5471 if (isUnsupportedFloatingType(VT)) { 5472 RTLIB::Libcall LC; 5473 if (Op.getOpcode() == ISD::SINT_TO_FP) 5474 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5475 Op.getValueType()); 5476 else 5477 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5478 Op.getValueType()); 5479 MakeLibCallOptions CallOptions; 5480 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5481 CallOptions, SDLoc(Op)).first; 5482 } 5483 5484 return Op; 5485 } 5486 5487 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5488 // Implement fcopysign with a fabs and a conditional fneg. 5489 SDValue Tmp0 = Op.getOperand(0); 5490 SDValue Tmp1 = Op.getOperand(1); 5491 SDLoc dl(Op); 5492 EVT VT = Op.getValueType(); 5493 EVT SrcVT = Tmp1.getValueType(); 5494 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5495 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5496 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5497 5498 if (UseNEON) { 5499 // Use VBSL to copy the sign bit. 5500 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5501 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5502 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5503 EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; 5504 if (VT == MVT::f64) 5505 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5506 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5507 DAG.getConstant(32, dl, MVT::i32)); 5508 else /*if (VT == MVT::f32)*/ 5509 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5510 if (SrcVT == MVT::f32) { 5511 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5512 if (VT == MVT::f64) 5513 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5514 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5515 DAG.getConstant(32, dl, MVT::i32)); 5516 } else if (VT == MVT::f32) 5517 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5518 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5519 DAG.getConstant(32, dl, MVT::i32)); 5520 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5521 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5522 5523 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5524 dl, MVT::i32); 5525 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5526 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5527 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5528 5529 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5530 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5531 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5532 if (VT == MVT::f32) { 5533 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5534 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5535 DAG.getConstant(0, dl, MVT::i32)); 5536 } else { 5537 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5538 } 5539 5540 return Res; 5541 } 5542 5543 // Bitcast operand 1 to i32. 5544 if (SrcVT == MVT::f64) 5545 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5546 Tmp1).getValue(1); 5547 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5548 5549 // Or in the signbit with integer operations. 5550 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5551 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5552 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5553 if (VT == MVT::f32) { 5554 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5555 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5556 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5557 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5558 } 5559 5560 // f64: Or the high part with signbit and then combine two parts. 5561 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5562 Tmp0); 5563 SDValue Lo = Tmp0.getValue(0); 5564 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5565 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5566 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5567 } 5568 5569 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5570 MachineFunction &MF = DAG.getMachineFunction(); 5571 MachineFrameInfo &MFI = MF.getFrameInfo(); 5572 MFI.setReturnAddressIsTaken(true); 5573 5574 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5575 return SDValue(); 5576 5577 EVT VT = Op.getValueType(); 5578 SDLoc dl(Op); 5579 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5580 if (Depth) { 5581 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5582 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5583 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5584 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5585 MachinePointerInfo()); 5586 } 5587 5588 // Return LR, which contains the return address. Mark it an implicit live-in. 
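// For example, __builtin_return_address(0) lowers to the CopyFromReg of LR
// created below, while non-zero depths were handled above by loading from
// (frame address + 4), where the code assumes the prologue saved LR next to
// the chained frame pointer.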
5589 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5590 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5591 } 5592 5593 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5594 const ARMBaseRegisterInfo &ARI = 5595 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5596 MachineFunction &MF = DAG.getMachineFunction(); 5597 MachineFrameInfo &MFI = MF.getFrameInfo(); 5598 MFI.setFrameAddressIsTaken(true); 5599 5600 EVT VT = Op.getValueType(); 5601 SDLoc dl(Op); // FIXME probably not meaningful 5602 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5603 Register FrameReg = ARI.getFrameRegister(MF); 5604 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5605 while (Depth--) 5606 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5607 MachinePointerInfo()); 5608 return FrameAddr; 5609 } 5610 5611 // FIXME? Maybe this could be a TableGen attribute on some registers and 5612 // this table could be generated automatically from RegInfo. 5613 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 5614 const MachineFunction &MF) const { 5615 Register Reg = StringSwitch<unsigned>(RegName) 5616 .Case("sp", ARM::SP) 5617 .Default(0); 5618 if (Reg) 5619 return Reg; 5620 report_fatal_error(Twine("Invalid register name \"" 5621 + StringRef(RegName) + "\".")); 5622 } 5623 5624 // Result is 64 bit value so split into two 32 bit values and return as a 5625 // pair of values. 5626 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5627 SelectionDAG &DAG) { 5628 SDLoc DL(N); 5629 5630 // This function is only supposed to be called for i64 type destination. 5631 assert(N->getValueType(0) == MVT::i64 5632 && "ExpandREAD_REGISTER called for non-i64 type result."); 5633 5634 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5635 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5636 N->getOperand(0), 5637 N->getOperand(1)); 5638 5639 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5640 Read.getValue(1))); 5641 Results.push_back(Read.getOperand(0)); 5642 } 5643 5644 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 5645 /// When \p DstVT, the destination type of \p BC, is on the vector 5646 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5647 /// it might be possible to combine them, such that everything stays on the 5648 /// vector register bank. 5649 /// \p return The node that would replace \p BT, if the combine 5650 /// is possible. 5651 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5652 SelectionDAG &DAG) { 5653 SDValue Op = BC->getOperand(0); 5654 EVT DstVT = BC->getValueType(0); 5655 5656 // The only vector instruction that can produce a scalar (remember, 5657 // since the bitcast was about to be turned into VMOVDRR, the source 5658 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5659 // Moreover, we can do this combine only if there is one use. 5660 // Finally, if the destination type is not a vector, there is not 5661 // much point on forcing everything on the vector bank. 5662 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5663 !Op.hasOneUse()) 5664 return SDValue(); 5665 5666 // If the index is not constant, we will introduce an additional 5667 // multiply that will stick. 5668 // Give up in that case. 
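// Illustrative example of the rewrite performed below:
//   (v2f32 (bitcast (i64 extractelt (v2i64 V), 1)))
// becomes
//   (v2f32 (extract_subvector (v4f32 (bitcast V)), 2))
// so the value never leaves the vector register bank.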
5669 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5670 if (!Index) 5671 return SDValue(); 5672 unsigned DstNumElt = DstVT.getVectorNumElements(); 5673 5674 // Compute the new index. 5675 const APInt &APIntIndex = Index->getAPIntValue(); 5676 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5677 NewIndex *= APIntIndex; 5678 // Check if the new constant index fits into i32. 5679 if (NewIndex.getBitWidth() > 32) 5680 return SDValue(); 5681 5682 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5683 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5684 SDLoc dl(Op); 5685 SDValue ExtractSrc = Op.getOperand(0); 5686 EVT VecVT = EVT::getVectorVT( 5687 *DAG.getContext(), DstVT.getScalarType(), 5688 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5689 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5690 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5691 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5692 } 5693 5694 /// ExpandBITCAST - If the target supports VFP, this function is called to 5695 /// expand a bit convert where either the source or destination type is i64 to 5696 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5697 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5698 /// vectors), since the legalizer won't know what to do with that. 5699 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5700 const ARMSubtarget *Subtarget) { 5701 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5702 SDLoc dl(N); 5703 SDValue Op = N->getOperand(0); 5704 5705 // This function is only supposed to be called for i64 types, either as the 5706 // source or destination of the bit convert. 5707 EVT SrcVT = Op.getValueType(); 5708 EVT DstVT = N->getValueType(0); 5709 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5710 5711 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5712 // FullFP16: half values are passed in S-registers, and we don't 5713 // need any of the bitcast and moves: 5714 // 5715 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5716 // t5: i32 = bitcast t2 5717 // t18: f16 = ARMISD::VMOVhr t5 5718 if (Op.getOpcode() != ISD::CopyFromReg || 5719 Op.getValueType() != MVT::f32) 5720 return SDValue(); 5721 5722 auto Move = N->use_begin(); 5723 if (Move->getOpcode() != ARMISD::VMOVhr) 5724 return SDValue(); 5725 5726 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5727 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5728 DAG.ReplaceAllUsesWith(*Move, &Copy); 5729 return Copy; 5730 } 5731 5732 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5733 if (!HasFullFP16) 5734 return SDValue(); 5735 // SoftFP: read half-precision arguments: 5736 // 5737 // t2: i32,ch = ... 
5738 // t7: i16 = truncate t2 <~~~~ Op 5739 // t8: f16 = bitcast t7 <~~~~ N 5740 // 5741 if (Op.getOperand(0).getValueType() == MVT::i32) 5742 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5743 MVT::f16, Op.getOperand(0)); 5744 5745 return SDValue(); 5746 } 5747 5748 // Half-precision return values 5749 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5750 if (!HasFullFP16) 5751 return SDValue(); 5752 // 5753 // t11: f16 = fadd t8, t10 5754 // t12: i16 = bitcast t11 <~~~ SDNode N 5755 // t13: i32 = zero_extend t12 5756 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5757 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5758 // 5759 // transform this into: 5760 // 5761 // t20: i32 = ARMISD::VMOVrh t11 5762 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5763 // 5764 auto ZeroExtend = N->use_begin(); 5765 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5766 ZeroExtend->getValueType(0) != MVT::i32) 5767 return SDValue(); 5768 5769 auto Copy = ZeroExtend->use_begin(); 5770 if (Copy->getOpcode() == ISD::CopyToReg && 5771 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5772 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5773 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5774 return Cvt; 5775 } 5776 return SDValue(); 5777 } 5778 5779 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5780 return SDValue(); 5781 5782 // Turn i64->f64 into VMOVDRR. 5783 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5784 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5785 // if we can combine the bitcast with its source. 5786 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5787 return Val; 5788 5789 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5790 DAG.getConstant(0, dl, MVT::i32)); 5791 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5792 DAG.getConstant(1, dl, MVT::i32)); 5793 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5794 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5795 } 5796 5797 // Turn f64->i64 into VMOVRRD. 5798 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5799 SDValue Cvt; 5800 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5801 SrcVT.getVectorNumElements() > 1) 5802 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5803 DAG.getVTList(MVT::i32, MVT::i32), 5804 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5805 else 5806 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5807 DAG.getVTList(MVT::i32, MVT::i32), Op); 5808 // Merge the pieces into a single i64 value. 5809 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5810 } 5811 5812 return SDValue(); 5813 } 5814 5815 /// getZeroVector - Returns a vector of specified type with all zero elements. 5816 /// Zero vectors are used to represent vector negation and in those cases 5817 /// will be implemented with the NEON VNEG instruction. However, VNEG does 5818 /// not support i64 elements, so sometimes the zero vectors will need to be 5819 /// explicitly constructed. Regardless, use a canonical VMOV to create the 5820 /// zero vector. 5821 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5822 assert(VT.isVector() && "Expected a vector type"); 5823 // The canonical modified immediate encoding of a zero vector is....0! 5824 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5825 EVT VmovVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; 5826 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 5827 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5828 } 5829 5830 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5831 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5832 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 5833 SelectionDAG &DAG) const { 5834 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5835 EVT VT = Op.getValueType(); 5836 unsigned VTBits = VT.getSizeInBits(); 5837 SDLoc dl(Op); 5838 SDValue ShOpLo = Op.getOperand(0); 5839 SDValue ShOpHi = Op.getOperand(1); 5840 SDValue ShAmt = Op.getOperand(2); 5841 SDValue ARMcc; 5842 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5843 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5844 5845 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5846 5847 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5848 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5849 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5850 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5851 DAG.getConstant(VTBits, dl, MVT::i32)); 5852 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5853 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5854 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5855 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5856 ISD::SETGE, ARMcc, DAG, dl); 5857 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 5858 ARMcc, CCR, CmpLo); 5859 5860 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5861 SDValue HiBigShift = Opc == ISD::SRA 5862 ? DAG.getNode(Opc, dl, VT, ShOpHi, 5863 DAG.getConstant(VTBits - 1, dl, VT)) 5864 : DAG.getConstant(0, dl, VT); 5865 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5866 ISD::SETGE, ARMcc, DAG, dl); 5867 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5868 ARMcc, CCR, CmpHi); 5869 5870 SDValue Ops[2] = { Lo, Hi }; 5871 return DAG.getMergeValues(Ops, dl); 5872 } 5873 5874 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5875 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 
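/// With VTBits == 32 this computes, roughly:
///   Hi = ShAmt < 32 ? (ShOpHi << ShAmt) | (ShOpLo >> (32 - ShAmt))
///                   : ShOpLo << (ShAmt - 32);
///   Lo = ShAmt < 32 ? ShOpLo << ShAmt : 0;
/// selecting between the two cases with CMOVs on (ShAmt - 32).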
5876 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 5877 SelectionDAG &DAG) const { 5878 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5879 EVT VT = Op.getValueType(); 5880 unsigned VTBits = VT.getSizeInBits(); 5881 SDLoc dl(Op); 5882 SDValue ShOpLo = Op.getOperand(0); 5883 SDValue ShOpHi = Op.getOperand(1); 5884 SDValue ShAmt = Op.getOperand(2); 5885 SDValue ARMcc; 5886 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5887 5888 assert(Op.getOpcode() == ISD::SHL_PARTS); 5889 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5890 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5891 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5892 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 5893 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5894 5895 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5896 DAG.getConstant(VTBits, dl, MVT::i32)); 5897 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 5898 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5899 ISD::SETGE, ARMcc, DAG, dl); 5900 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5901 ARMcc, CCR, CmpHi); 5902 5903 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5904 ISD::SETGE, ARMcc, DAG, dl); 5905 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5906 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 5907 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 5908 5909 SDValue Ops[2] = { Lo, Hi }; 5910 return DAG.getMergeValues(Ops, dl); 5911 } 5912 5913 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 5914 SelectionDAG &DAG) const { 5915 // The rounding mode is in bits 23:22 of the FPSCR. 5916 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 5917 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 5918 // so that the shift + and get folded into a bitfield extract. 
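// For example, FPSCR[23:22] == 0b10 (round towards minus infinity) gives
// ((0b10 + 1) & 3) == 3, the FLT_ROUNDS value for rounding towards negative
// infinity.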
5919 SDLoc dl(Op); 5920 SDValue Ops[] = { DAG.getEntryNode(), 5921 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5922 5923 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5924 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5925 DAG.getConstant(1U << 22, dl, MVT::i32)); 5926 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5927 DAG.getConstant(22, dl, MVT::i32)); 5928 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5929 DAG.getConstant(3, dl, MVT::i32)); 5930 } 5931 5932 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5933 const ARMSubtarget *ST) { 5934 SDLoc dl(N); 5935 EVT VT = N->getValueType(0); 5936 if (VT.isVector() && ST->hasNEON()) { 5937 5938 // Compute the least significant set bit: LSB = X & -X 5939 SDValue X = N->getOperand(0); 5940 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5941 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5942 5943 EVT ElemTy = VT.getVectorElementType(); 5944 5945 if (ElemTy == MVT::i8) { 5946 // Compute with: cttz(x) = ctpop(lsb - 1) 5947 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5948 DAG.getTargetConstant(1, dl, ElemTy)); 5949 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5950 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5951 } 5952 5953 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5954 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5955 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5956 unsigned NumBits = ElemTy.getSizeInBits(); 5957 SDValue WidthMinus1 = 5958 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5959 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5960 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5961 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5962 } 5963 5964 // Compute with: cttz(x) = ctpop(lsb - 1) 5965 5966 // Compute LSB - 1. 5967 SDValue Bits; 5968 if (ElemTy == MVT::i64) { 5969 // Load constant 0xffff'ffff'ffff'ffff to register. 5970 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5971 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5972 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5973 } else { 5974 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5975 DAG.getTargetConstant(1, dl, ElemTy)); 5976 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5977 } 5978 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5979 } 5980 5981 if (!ST->hasV6T2Ops()) 5982 return SDValue(); 5983 5984 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5985 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 5986 } 5987 5988 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 5989 const ARMSubtarget *ST) { 5990 EVT VT = N->getValueType(0); 5991 SDLoc DL(N); 5992 5993 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 5994 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 5995 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 5996 "Unexpected type for custom ctpop lowering"); 5997 5998 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5999 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6000 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6001 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6002 6003 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6004 unsigned EltSize = 8; 6005 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 6006 while (EltSize != VT.getScalarSizeInBits()) { 6007 SmallVector<SDValue, 8> Ops; 6008 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6009 TLI.getPointerTy(DAG.getDataLayout()))); 6010 Ops.push_back(Res); 6011 6012 EltSize *= 2; 6013 NumElts /= 2; 6014 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6015 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6016 } 6017 6018 return Res; 6019 } 6020 6021 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 6022 /// operand of a vector shift operation, where all the elements of the 6023 /// build_vector must have the same constant integer value. 6024 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6025 // Ignore bit_converts. 6026 while (Op.getOpcode() == ISD::BITCAST) 6027 Op = Op.getOperand(0); 6028 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6029 APInt SplatBits, SplatUndef; 6030 unsigned SplatBitSize; 6031 bool HasAnyUndefs; 6032 if (!BVN || 6033 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6034 ElementBits) || 6035 SplatBitSize > ElementBits) 6036 return false; 6037 Cnt = SplatBits.getSExtValue(); 6038 return true; 6039 } 6040 6041 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6042 /// operand of a vector shift left operation. That value must be in the range: 6043 /// 0 <= Value < ElementBits for a left shift; or 6044 /// 0 <= Value <= ElementBits for a long left shift. 6045 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6046 assert(VT.isVector() && "vector shift count is not a vector type"); 6047 int64_t ElementBits = VT.getScalarSizeInBits(); 6048 if (!getVShiftImm(Op, ElementBits, Cnt)) 6049 return false; 6050 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6051 } 6052 6053 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6054 /// operand of a vector shift right operation. For a shift opcode, the value 6055 /// is positive, but for an intrinsic the value count must be negative. The 6056 /// absolute value must be in the range: 6057 /// 1 <= |Value| <= ElementBits for a right shift; or 6058 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6059 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6060 int64_t &Cnt) { 6061 assert(VT.isVector() && "vector shift count is not a vector type"); 6062 int64_t ElementBits = VT.getScalarSizeInBits(); 6063 if (!getVShiftImm(Op, ElementBits, Cnt)) 6064 return false; 6065 if (!isIntrinsic) 6066 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6067 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6068 Cnt = -Cnt; 6069 return true; 6070 } 6071 return false; 6072 } 6073 6074 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6075 const ARMSubtarget *ST) { 6076 EVT VT = N->getValueType(0); 6077 SDLoc dl(N); 6078 int64_t Cnt; 6079 6080 if (!VT.isVector()) 6081 return SDValue(); 6082 6083 // We essentially have two forms here. Shift by an immediate and shift by a 6084 // vector register (there are also shift by a gpr, but that is just handled 6085 // with a tablegen pattern). We cannot easily match shift by an immediate in 6086 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6087 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6088 // signed or unsigned, and a negative shift indicates a shift right). 
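// For example, an SRL of a v4i32 by a constant splat of 3 becomes
// VSHRuIMM #3, whereas an SRL by a non-constant vector amount is emitted as a
// VSHLu of the negated per-lane shift amounts.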
6089 if (N->getOpcode() == ISD::SHL) { 6090 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6091 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6092 DAG.getConstant(Cnt, dl, MVT::i32)); 6093 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6094 N->getOperand(1)); 6095 } 6096 6097 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6098 "unexpected vector shift opcode"); 6099 6100 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6101 unsigned VShiftOpc = 6102 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6103 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6104 DAG.getConstant(Cnt, dl, MVT::i32)); 6105 } 6106 6107 // Other right shifts we don't have operations for (we use a shift left by a 6108 // negative number). 6109 EVT ShiftVT = N->getOperand(1).getValueType(); 6110 SDValue NegatedCount = DAG.getNode( 6111 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6112 unsigned VShiftOpc = 6113 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); 6114 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6115 } 6116 6117 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6118 const ARMSubtarget *ST) { 6119 EVT VT = N->getValueType(0); 6120 SDLoc dl(N); 6121 6122 // We can get here for a node like i32 = ISD::SHL i32, i64 6123 if (VT != MVT::i64) 6124 return SDValue(); 6125 6126 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6127 N->getOpcode() == ISD::SHL) && 6128 "Unknown shift to lower!"); 6129 6130 unsigned ShOpc = N->getOpcode(); 6131 if (ST->hasMVEIntegerOps()) { 6132 SDValue ShAmt = N->getOperand(1); 6133 unsigned ShPartsOpc = ARMISD::LSLL; 6134 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6135 6136 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6137 // then do the default optimisation 6138 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6139 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6140 return SDValue(); 6141 6142 // Extract the lower 32 bits of the shift amount if it's not an i32 6143 if (ShAmt->getValueType(0) != MVT::i32) 6144 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6145 6146 if (ShOpc == ISD::SRL) { 6147 if (!Con) 6148 // There is no t2LSRLr instruction so negate and perform an lsll if the 6149 // shift amount is in a register, emulating a right shift. 6150 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6151 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6152 else 6153 // Else generate an lsrl on the immediate shift amount 6154 ShPartsOpc = ARMISD::LSRL; 6155 } else if (ShOpc == ISD::SRA) 6156 ShPartsOpc = ARMISD::ASRL; 6157 6158 // Lower 32 bits of the destination/source 6159 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6160 DAG.getConstant(0, dl, MVT::i32)); 6161 // Upper 32 bits of the destination/source 6162 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6163 DAG.getConstant(1, dl, MVT::i32)); 6164 6165 // Generate the shift operation as computed above 6166 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6167 ShAmt); 6168 // The upper 32 bits come from the second return value of lsll 6169 Hi = SDValue(Lo.getNode(), 1); 6170 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6171 } 6172 6173 // We only lower SRA, SRL of 1 here, all others use generic lowering. 
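// A 64-bit shift right by one is cheap here: shift the high word right by one
// so the bit that falls off lands in the carry flag, then rotate that carry
// into the low word with RRX (see below).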
6174 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6175 return SDValue(); 6176 6177 // If we are in thumb mode, we don't have RRX. 6178 if (ST->isThumb1Only()) 6179 return SDValue(); 6180 6181 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6182 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6183 DAG.getConstant(0, dl, MVT::i32)); 6184 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6185 DAG.getConstant(1, dl, MVT::i32)); 6186 6187 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6188 // captures the result into a carry flag. 6189 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6190 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6191 6192 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6193 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6194 6195 // Merge the pieces into a single i64 value. 6196 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6197 } 6198 6199 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6200 const ARMSubtarget *ST) { 6201 bool Invert = false; 6202 bool Swap = false; 6203 unsigned Opc = ARMCC::AL; 6204 6205 SDValue Op0 = Op.getOperand(0); 6206 SDValue Op1 = Op.getOperand(1); 6207 SDValue CC = Op.getOperand(2); 6208 EVT VT = Op.getValueType(); 6209 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6210 SDLoc dl(Op); 6211 6212 EVT CmpVT; 6213 if (ST->hasNEON()) 6214 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6215 else { 6216 assert(ST->hasMVEIntegerOps() && 6217 "No hardware support for integer vector comparison!"); 6218 6219 if (Op.getValueType().getVectorElementType() != MVT::i1) 6220 return SDValue(); 6221 6222 // Make sure we expand floating point setcc to scalar if we do not have 6223 // mve.fp, so that we can handle them from there. 6224 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6225 return SDValue(); 6226 6227 CmpVT = VT; 6228 } 6229 6230 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6231 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6232 // Special-case integer 64-bit equality comparisons. They aren't legal, 6233 // but they can be lowered with a few vector instructions. 6234 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6235 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6236 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6237 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6238 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6239 DAG.getCondCode(ISD::SETEQ)); 6240 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6241 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6242 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6243 if (SetCCOpcode == ISD::SETNE) 6244 Merged = DAG.getNOT(dl, Merged, CmpVT); 6245 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6246 return Merged; 6247 } 6248 6249 if (CmpVT.getVectorElementType() == MVT::i64) 6250 // 64-bit comparisons are not legal in general. 
6251 return SDValue(); 6252 6253 if (Op1.getValueType().isFloatingPoint()) { 6254 switch (SetCCOpcode) { 6255 default: llvm_unreachable("Illegal FP comparison"); 6256 case ISD::SETUNE: 6257 case ISD::SETNE: 6258 if (ST->hasMVEFloatOps()) { 6259 Opc = ARMCC::NE; break; 6260 } else { 6261 Invert = true; LLVM_FALLTHROUGH; 6262 } 6263 case ISD::SETOEQ: 6264 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6265 case ISD::SETOLT: 6266 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6267 case ISD::SETOGT: 6268 case ISD::SETGT: Opc = ARMCC::GT; break; 6269 case ISD::SETOLE: 6270 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6271 case ISD::SETOGE: 6272 case ISD::SETGE: Opc = ARMCC::GE; break; 6273 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6274 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6275 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6276 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6277 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6278 case ISD::SETONE: { 6279 // Expand this to (OLT | OGT). 6280 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6281 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6282 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6283 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6284 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6285 if (Invert) 6286 Result = DAG.getNOT(dl, Result, VT); 6287 return Result; 6288 } 6289 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6290 case ISD::SETO: { 6291 // Expand this to (OLT | OGE). 6292 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6293 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6294 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6295 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6296 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6297 if (Invert) 6298 Result = DAG.getNOT(dl, Result, VT); 6299 return Result; 6300 } 6301 } 6302 } else { 6303 // Integer comparisons. 6304 switch (SetCCOpcode) { 6305 default: llvm_unreachable("Illegal integer comparison"); 6306 case ISD::SETNE: 6307 if (ST->hasMVEIntegerOps()) { 6308 Opc = ARMCC::NE; break; 6309 } else { 6310 Invert = true; LLVM_FALLTHROUGH; 6311 } 6312 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6313 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6314 case ISD::SETGT: Opc = ARMCC::GT; break; 6315 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6316 case ISD::SETGE: Opc = ARMCC::GE; break; 6317 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6318 case ISD::SETUGT: Opc = ARMCC::HI; break; 6319 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6320 case ISD::SETUGE: Opc = ARMCC::HS; break; 6321 } 6322 6323 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6324 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6325 SDValue AndOp; 6326 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6327 AndOp = Op0; 6328 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6329 AndOp = Op1; 6330 6331 // Ignore bitconvert. 
6332 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6333 AndOp = AndOp.getOperand(0); 6334 6335 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6336 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6337 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6338 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6339 if (!Invert) 6340 Result = DAG.getNOT(dl, Result, VT); 6341 return Result; 6342 } 6343 } 6344 } 6345 6346 if (Swap) 6347 std::swap(Op0, Op1); 6348 6349 // If one of the operands is a constant vector zero, attempt to fold the 6350 // comparison to a specialized compare-against-zero form. 6351 SDValue SingleOp; 6352 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6353 SingleOp = Op0; 6354 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6355 if (Opc == ARMCC::GE) 6356 Opc = ARMCC::LE; 6357 else if (Opc == ARMCC::GT) 6358 Opc = ARMCC::LT; 6359 SingleOp = Op1; 6360 } 6361 6362 SDValue Result; 6363 if (SingleOp.getNode()) { 6364 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6365 DAG.getConstant(Opc, dl, MVT::i32)); 6366 } else { 6367 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6368 DAG.getConstant(Opc, dl, MVT::i32)); 6369 } 6370 6371 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6372 6373 if (Invert) 6374 Result = DAG.getNOT(dl, Result, VT); 6375 6376 return Result; 6377 } 6378 6379 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6380 SDValue LHS = Op.getOperand(0); 6381 SDValue RHS = Op.getOperand(1); 6382 SDValue Carry = Op.getOperand(2); 6383 SDValue Cond = Op.getOperand(3); 6384 SDLoc DL(Op); 6385 6386 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6387 6388 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6389 // have to invert the carry first. 6390 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6391 DAG.getConstant(1, DL, MVT::i32), Carry); 6392 // This converts the boolean value carry into the carry flag. 6393 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6394 6395 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6396 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6397 6398 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6399 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6400 SDValue ARMcc = DAG.getConstant( 6401 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6402 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6403 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6404 Cmp.getValue(1), SDValue()); 6405 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6406 CCR, Chain.getValue(1)); 6407 } 6408 6409 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6410 /// valid vector constant for a NEON or MVE instruction with a "modified 6411 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6412 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6413 unsigned SplatBitSize, SelectionDAG &DAG, 6414 const SDLoc &dl, EVT &VT, bool is128Bits, 6415 VMOVModImmType type) { 6416 unsigned OpCmode, Imm; 6417 6418 // SplatBitSize is set to the smallest size that splats the vector, so a 6419 // zero vector will always have SplatBitSize == 8. However, NEON modified 6420 // immediate instructions others than VMOV do not support the 8-bit encoding 6421 // of a zero vector, and the default encoding of zero is supposed to be the 6422 // 32-bit version. 
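// (A zero splat of, say, v4i32 therefore reports SplatBitSize == 8 and is
// widened back to 32 bits here so it is encoded via the 32-bit Cmode=000x
// form rather than the VMOV-only 8-bit one.)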
6423 if (SplatBits == 0) 6424 SplatBitSize = 32; 6425 6426 switch (SplatBitSize) { 6427 case 8: 6428 if (type != VMOVModImm) 6429 return SDValue(); 6430 // Any 1-byte value is OK. Op=0, Cmode=1110. 6431 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6432 OpCmode = 0xe; 6433 Imm = SplatBits; 6434 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6435 break; 6436 6437 case 16: 6438 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6439 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6440 if ((SplatBits & ~0xff) == 0) { 6441 // Value = 0x00nn: Op=x, Cmode=100x. 6442 OpCmode = 0x8; 6443 Imm = SplatBits; 6444 break; 6445 } 6446 if ((SplatBits & ~0xff00) == 0) { 6447 // Value = 0xnn00: Op=x, Cmode=101x. 6448 OpCmode = 0xa; 6449 Imm = SplatBits >> 8; 6450 break; 6451 } 6452 return SDValue(); 6453 6454 case 32: 6455 // NEON's 32-bit VMOV supports splat values where: 6456 // * only one byte is nonzero, or 6457 // * the least significant byte is 0xff and the second byte is nonzero, or 6458 // * the least significant 2 bytes are 0xff and the third is nonzero. 6459 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6460 if ((SplatBits & ~0xff) == 0) { 6461 // Value = 0x000000nn: Op=x, Cmode=000x. 6462 OpCmode = 0; 6463 Imm = SplatBits; 6464 break; 6465 } 6466 if ((SplatBits & ~0xff00) == 0) { 6467 // Value = 0x0000nn00: Op=x, Cmode=001x. 6468 OpCmode = 0x2; 6469 Imm = SplatBits >> 8; 6470 break; 6471 } 6472 if ((SplatBits & ~0xff0000) == 0) { 6473 // Value = 0x00nn0000: Op=x, Cmode=010x. 6474 OpCmode = 0x4; 6475 Imm = SplatBits >> 16; 6476 break; 6477 } 6478 if ((SplatBits & ~0xff000000) == 0) { 6479 // Value = 0xnn000000: Op=x, Cmode=011x. 6480 OpCmode = 0x6; 6481 Imm = SplatBits >> 24; 6482 break; 6483 } 6484 6485 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6486 if (type == OtherModImm) return SDValue(); 6487 6488 if ((SplatBits & ~0xffff) == 0 && 6489 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6490 // Value = 0x0000nnff: Op=x, Cmode=1100. 6491 OpCmode = 0xc; 6492 Imm = SplatBits >> 8; 6493 break; 6494 } 6495 6496 // cmode == 0b1101 is not supported for MVE VMVN 6497 if (type == MVEVMVNModImm) 6498 return SDValue(); 6499 6500 if ((SplatBits & ~0xffffff) == 0 && 6501 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6502 // Value = 0x00nnffff: Op=x, Cmode=1101. 6503 OpCmode = 0xd; 6504 Imm = SplatBits >> 16; 6505 break; 6506 } 6507 6508 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6509 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6510 // VMOV.I32. A (very) minor optimization would be to replicate the value 6511 // and fall through here to test for a valid 64-bit splat. But, then the 6512 // caller would also need to check and handle the change in size. 6513 return SDValue(); 6514 6515 case 64: { 6516 if (type != VMOVModImm) 6517 return SDValue(); 6518 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6519 uint64_t BitMask = 0xff; 6520 uint64_t Val = 0; 6521 unsigned ImmMask = 1; 6522 Imm = 0; 6523 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6524 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6525 Val |= BitMask; 6526 Imm |= ImmMask; 6527 } else if ((SplatBits & BitMask) != 0) { 6528 return SDValue(); 6529 } 6530 BitMask <<= 8; 6531 ImmMask <<= 1; 6532 } 6533 6534 if (DAG.getDataLayout().isBigEndian()) 6535 // swap higher and lower 32 bit word 6536 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 6537 6538 // Op=1, Cmode=1110. 
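// (Each set bit of Imm selects an all-0xff byte, so the all-ones splat
// 0xffffffffffffffff encodes as Imm == 0xff; that is the 0x1eff constant
// LowerCTTZ materialises above.)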
6539 OpCmode = 0x1e; 6540 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 6541 break; 6542 } 6543 6544 default: 6545 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 6546 } 6547 6548 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 6549 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 6550 } 6551 6552 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 6553 const ARMSubtarget *ST) const { 6554 EVT VT = Op.getValueType(); 6555 bool IsDouble = (VT == MVT::f64); 6556 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 6557 const APFloat &FPVal = CFP->getValueAPF(); 6558 6559 // Prevent floating-point constants from using literal loads 6560 // when execute-only is enabled. 6561 if (ST->genExecuteOnly()) { 6562 // If we can represent the constant as an immediate, don't lower it 6563 if (isFPImmLegal(FPVal, VT)) 6564 return Op; 6565 // Otherwise, construct as integer, and move to float register 6566 APInt INTVal = FPVal.bitcastToAPInt(); 6567 SDLoc DL(CFP); 6568 switch (VT.getSimpleVT().SimpleTy) { 6569 default: 6570 llvm_unreachable("Unknown floating point type!"); 6571 break; 6572 case MVT::f64: { 6573 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 6574 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 6575 if (!ST->isLittle()) 6576 std::swap(Lo, Hi); 6577 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 6578 } 6579 case MVT::f32: 6580 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 6581 DAG.getConstant(INTVal, DL, MVT::i32)); 6582 } 6583 } 6584 6585 if (!ST->hasVFP3Base()) 6586 return SDValue(); 6587 6588 // Use the default (constant pool) lowering for double constants when we have 6589 // an SP-only FPU 6590 if (IsDouble && !Subtarget->hasFP64()) 6591 return SDValue(); 6592 6593 // Try splatting with a VMOV.f32... 6594 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 6595 6596 if (ImmVal != -1) { 6597 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 6598 // We have code in place to select a valid ConstantFP already, no need to 6599 // do any mangling. 6600 return Op; 6601 } 6602 6603 // It's a float and we are trying to use NEON operations where 6604 // possible. Lower it to a splat followed by an extract. 6605 SDLoc DL(Op); 6606 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6607 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6608 NewVal); 6609 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6610 DAG.getConstant(0, DL, MVT::i32)); 6611 } 6612 6613 // The rest of our options are NEON only, make sure that's allowed before 6614 // proceeding.. 6615 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 6616 return SDValue(); 6617 6618 EVT VMovVT; 6619 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 6620 6621 // It wouldn't really be worth bothering for doubles except for one very 6622 // important value, which does happen to match: 0.0. So make sure we don't do 6623 // anything stupid. 6624 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 6625 return SDValue(); 6626 6627 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 
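// isVMOVModifiedImm is only given the low 32 bits here; for f64 the check
// just above has already ensured both halves are identical, so splatting that
// one word with VMOV.i32 reproduces the whole double.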
6628 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 6629 VMovVT, false, VMOVModImm); 6630 if (NewVal != SDValue()) { 6631 SDLoc DL(Op); 6632 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 6633 NewVal); 6634 if (IsDouble) 6635 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6636 6637 // It's a float: cast and extract a vector element. 6638 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6639 VecConstant); 6640 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6641 DAG.getConstant(0, DL, MVT::i32)); 6642 } 6643 6644 // Finally, try a VMVN.i32 6645 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 6646 false, VMVNModImm); 6647 if (NewVal != SDValue()) { 6648 SDLoc DL(Op); 6649 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 6650 6651 if (IsDouble) 6652 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6653 6654 // It's a float: cast and extract a vector element. 6655 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6656 VecConstant); 6657 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6658 DAG.getConstant(0, DL, MVT::i32)); 6659 } 6660 6661 return SDValue(); 6662 } 6663 6664 // check if an VEXT instruction can handle the shuffle mask when the 6665 // vector sources of the shuffle are the same. 6666 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6667 unsigned NumElts = VT.getVectorNumElements(); 6668 6669 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6670 if (M[0] < 0) 6671 return false; 6672 6673 Imm = M[0]; 6674 6675 // If this is a VEXT shuffle, the immediate value is the index of the first 6676 // element. The other shuffle indices must be the successive elements after 6677 // the first one. 6678 unsigned ExpectedElt = Imm; 6679 for (unsigned i = 1; i < NumElts; ++i) { 6680 // Increment the expected index. If it wraps around, just follow it 6681 // back to index zero and keep going. 6682 ++ExpectedElt; 6683 if (ExpectedElt == NumElts) 6684 ExpectedElt = 0; 6685 6686 if (M[i] < 0) continue; // ignore UNDEF indices 6687 if (ExpectedElt != static_cast<unsigned>(M[i])) 6688 return false; 6689 } 6690 6691 return true; 6692 } 6693 6694 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6695 bool &ReverseVEXT, unsigned &Imm) { 6696 unsigned NumElts = VT.getVectorNumElements(); 6697 ReverseVEXT = false; 6698 6699 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6700 if (M[0] < 0) 6701 return false; 6702 6703 Imm = M[0]; 6704 6705 // If this is a VEXT shuffle, the immediate value is the index of the first 6706 // element. The other shuffle indices must be the successive elements after 6707 // the first one. 6708 unsigned ExpectedElt = Imm; 6709 for (unsigned i = 1; i < NumElts; ++i) { 6710 // Increment the expected index. If it wraps around, it may still be 6711 // a VEXT but the source vectors must be swapped. 6712 ExpectedElt += 1; 6713 if (ExpectedElt == NumElts * 2) { 6714 ExpectedElt = 0; 6715 ReverseVEXT = true; 6716 } 6717 6718 if (M[i] < 0) continue; // ignore UNDEF indices 6719 if (ExpectedElt != static_cast<unsigned>(M[i])) 6720 return false; 6721 } 6722 6723 // Adjust the index value if the source operands will be swapped. 6724 if (ReverseVEXT) 6725 Imm -= NumElts; 6726 6727 return true; 6728 } 6729 6730 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 6731 /// instruction with the specified blocksize. 
(The order of the elements 6732 /// within each block of the vector is reversed.) 6733 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6734 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6735 "Only possible block sizes for VREV are: 16, 32, 64"); 6736 6737 unsigned EltSz = VT.getScalarSizeInBits(); 6738 if (EltSz == 64) 6739 return false; 6740 6741 unsigned NumElts = VT.getVectorNumElements(); 6742 unsigned BlockElts = M[0] + 1; 6743 // If the first shuffle index is UNDEF, be optimistic. 6744 if (M[0] < 0) 6745 BlockElts = BlockSize / EltSz; 6746 6747 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6748 return false; 6749 6750 for (unsigned i = 0; i < NumElts; ++i) { 6751 if (M[i] < 0) continue; // ignore UNDEF indices 6752 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6753 return false; 6754 } 6755 6756 return true; 6757 } 6758 6759 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6760 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6761 // range, then 0 is placed into the resulting vector. So pretty much any mask 6762 // of 8 elements can work here. 6763 return VT == MVT::v8i8 && M.size() == 8; 6764 } 6765 6766 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6767 unsigned Index) { 6768 if (Mask.size() == Elements * 2) 6769 return Index / Elements; 6770 return Mask[Index] == 0 ? 0 : 1; 6771 } 6772 6773 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6774 // checking that pairs of elements in the shuffle mask represent the same index 6775 // in each vector, incrementing the expected index by 2 at each step. 6776 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6777 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6778 // v2={e,f,g,h} 6779 // WhichResult gives the offset for each element in the mask based on which 6780 // of the two results it belongs to. 6781 // 6782 // The transpose can be represented either as: 6783 // result1 = shufflevector v1, v2, result1_shuffle_mask 6784 // result2 = shufflevector v1, v2, result2_shuffle_mask 6785 // where v1/v2 and the shuffle masks have the same number of elements 6786 // (here WhichResult (see below) indicates which result is being checked) 6787 // 6788 // or as: 6789 // results = shufflevector v1, v2, shuffle_mask 6790 // where both results are returned in one vector and the shuffle mask has twice 6791 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6792 // want to check the low half and high half of the shuffle mask as if it were 6793 // the other case 6794 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6795 unsigned EltSz = VT.getScalarSizeInBits(); 6796 if (EltSz == 64) 6797 return false; 6798 6799 unsigned NumElts = VT.getVectorNumElements(); 6800 if (M.size() != NumElts && M.size() != NumElts*2) 6801 return false; 6802 6803 // If the mask is twice as long as the input vector then we need to check the 6804 // upper and lower parts of the mask with a matching value for WhichResult 6805 // FIXME: A mask with only even values will be rejected in case the first 6806 // element is undefined, e.g. 
[-1, 4, 2, 6] will be rejected, because only 6807 // M[0] is used to determine WhichResult 6808 for (unsigned i = 0; i < M.size(); i += NumElts) { 6809 WhichResult = SelectPairHalf(NumElts, M, i); 6810 for (unsigned j = 0; j < NumElts; j += 2) { 6811 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6812 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 6813 return false; 6814 } 6815 } 6816 6817 if (M.size() == NumElts*2) 6818 WhichResult = 0; 6819 6820 return true; 6821 } 6822 6823 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 6824 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6825 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 6826 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6827 unsigned EltSz = VT.getScalarSizeInBits(); 6828 if (EltSz == 64) 6829 return false; 6830 6831 unsigned NumElts = VT.getVectorNumElements(); 6832 if (M.size() != NumElts && M.size() != NumElts*2) 6833 return false; 6834 6835 for (unsigned i = 0; i < M.size(); i += NumElts) { 6836 WhichResult = SelectPairHalf(NumElts, M, i); 6837 for (unsigned j = 0; j < NumElts; j += 2) { 6838 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6839 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 6840 return false; 6841 } 6842 } 6843 6844 if (M.size() == NumElts*2) 6845 WhichResult = 0; 6846 6847 return true; 6848 } 6849 6850 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 6851 // that the mask elements are either all even and in steps of size 2 or all odd 6852 // and in steps of size 2. 6853 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 6854 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 6855 // v2={e,f,g,h} 6856 // Requires similar checks to that of isVTRNMask with 6857 // respect the how results are returned. 6858 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6859 unsigned EltSz = VT.getScalarSizeInBits(); 6860 if (EltSz == 64) 6861 return false; 6862 6863 unsigned NumElts = VT.getVectorNumElements(); 6864 if (M.size() != NumElts && M.size() != NumElts*2) 6865 return false; 6866 6867 for (unsigned i = 0; i < M.size(); i += NumElts) { 6868 WhichResult = SelectPairHalf(NumElts, M, i); 6869 for (unsigned j = 0; j < NumElts; ++j) { 6870 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6871 return false; 6872 } 6873 } 6874 6875 if (M.size() == NumElts*2) 6876 WhichResult = 0; 6877 6878 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6879 if (VT.is64BitVector() && EltSz == 32) 6880 return false; 6881 6882 return true; 6883 } 6884 6885 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6886 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 
6887 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6888 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6889 unsigned EltSz = VT.getScalarSizeInBits(); 6890 if (EltSz == 64) 6891 return false; 6892 6893 unsigned NumElts = VT.getVectorNumElements(); 6894 if (M.size() != NumElts && M.size() != NumElts*2) 6895 return false; 6896 6897 unsigned Half = NumElts / 2; 6898 for (unsigned i = 0; i < M.size(); i += NumElts) { 6899 WhichResult = SelectPairHalf(NumElts, M, i); 6900 for (unsigned j = 0; j < NumElts; j += Half) { 6901 unsigned Idx = WhichResult; 6902 for (unsigned k = 0; k < Half; ++k) { 6903 int MIdx = M[i + j + k]; 6904 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6905 return false; 6906 Idx += 2; 6907 } 6908 } 6909 } 6910 6911 if (M.size() == NumElts*2) 6912 WhichResult = 0; 6913 6914 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6915 if (VT.is64BitVector() && EltSz == 32) 6916 return false; 6917 6918 return true; 6919 } 6920 6921 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6922 // that pairs of elements of the shufflemask represent the same index in each 6923 // vector incrementing sequentially through the vectors. 6924 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6925 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6926 // v2={e,f,g,h} 6927 // Requires similar checks to that of isVTRNMask with respect the how results 6928 // are returned. 6929 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6930 unsigned EltSz = VT.getScalarSizeInBits(); 6931 if (EltSz == 64) 6932 return false; 6933 6934 unsigned NumElts = VT.getVectorNumElements(); 6935 if (M.size() != NumElts && M.size() != NumElts*2) 6936 return false; 6937 6938 for (unsigned i = 0; i < M.size(); i += NumElts) { 6939 WhichResult = SelectPairHalf(NumElts, M, i); 6940 unsigned Idx = WhichResult * NumElts / 2; 6941 for (unsigned j = 0; j < NumElts; j += 2) { 6942 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6943 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 6944 return false; 6945 Idx += 1; 6946 } 6947 } 6948 6949 if (M.size() == NumElts*2) 6950 WhichResult = 0; 6951 6952 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6953 if (VT.is64BitVector() && EltSz == 32) 6954 return false; 6955 6956 return true; 6957 } 6958 6959 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 6960 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6961 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 6962 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6963 unsigned EltSz = VT.getScalarSizeInBits(); 6964 if (EltSz == 64) 6965 return false; 6966 6967 unsigned NumElts = VT.getVectorNumElements(); 6968 if (M.size() != NumElts && M.size() != NumElts*2) 6969 return false; 6970 6971 for (unsigned i = 0; i < M.size(); i += NumElts) { 6972 WhichResult = SelectPairHalf(NumElts, M, i); 6973 unsigned Idx = WhichResult * NumElts / 2; 6974 for (unsigned j = 0; j < NumElts; j += 2) { 6975 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6976 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 6977 return false; 6978 Idx += 1; 6979 } 6980 } 6981 6982 if (M.size() == NumElts*2) 6983 WhichResult = 0; 6984 6985 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6986 if (VT.is64BitVector() && EltSz == 32) 6987 return false; 6988 6989 return true; 6990 } 6991 6992 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 6993 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 6994 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 6995 unsigned &WhichResult, 6996 bool &isV_UNDEF) { 6997 isV_UNDEF = false; 6998 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 6999 return ARMISD::VTRN; 7000 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7001 return ARMISD::VUZP; 7002 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7003 return ARMISD::VZIP; 7004 7005 isV_UNDEF = true; 7006 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7007 return ARMISD::VTRN; 7008 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7009 return ARMISD::VUZP; 7010 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7011 return ARMISD::VZIP; 7012 7013 return 0; 7014 } 7015 7016 /// \return true if this is a reverse operation on an vector. 7017 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7018 unsigned NumElts = VT.getVectorNumElements(); 7019 // Make sure the mask has the right size. 7020 if (NumElts != M.size()) 7021 return false; 7022 7023 // Look for <15, ..., 3, -1, 1, 0>. 7024 for (unsigned i = 0; i != NumElts; ++i) 7025 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7026 return false; 7027 7028 return true; 7029 } 7030 7031 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { 7032 unsigned NumElts = VT.getVectorNumElements(); 7033 // Make sure the mask has the right size. 7034 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7035 return false; 7036 7037 // If Top 7038 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7039 // This inserts Input2 into Input1 7040 // else if not Top 7041 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7042 // This inserts Input1 into Input2 7043 unsigned Offset = Top ? 0 : 1; 7044 for (unsigned i = 0; i < NumElts; i+=2) { 7045 if (M[i] >= 0 && M[i] != (int)i) 7046 return false; 7047 if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) 7048 return false; 7049 } 7050 7051 return true; 7052 } 7053 7054 // If N is an integer constant that can be moved into a register in one 7055 // instruction, return an SDValue of such a constant (will become a MOV 7056 // instruction). Otherwise return null. 
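// For Thumb1 that is a plain 8-bit immediate (or the complement of one);
// otherwise the ARM modified-immediate test (ARM_AM::getSOImmVal, an 8-bit
// value rotated right by an even amount) is applied to both Val and ~Val.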
7057 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7058 const ARMSubtarget *ST, const SDLoc &dl) { 7059 uint64_t Val; 7060 if (!isa<ConstantSDNode>(N)) 7061 return SDValue(); 7062 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7063 7064 if (ST->isThumb1Only()) { 7065 if (Val <= 255 || ~Val <= 255) 7066 return DAG.getConstant(Val, dl, MVT::i32); 7067 } else { 7068 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7069 return DAG.getConstant(Val, dl, MVT::i32); 7070 } 7071 return SDValue(); 7072 } 7073 7074 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7075 const ARMSubtarget *ST) { 7076 SDLoc dl(Op); 7077 EVT VT = Op.getValueType(); 7078 7079 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7080 7081 unsigned NumElts = VT.getVectorNumElements(); 7082 unsigned BoolMask; 7083 unsigned BitsPerBool; 7084 if (NumElts == 4) { 7085 BitsPerBool = 4; 7086 BoolMask = 0xf; 7087 } else if (NumElts == 8) { 7088 BitsPerBool = 2; 7089 BoolMask = 0x3; 7090 } else if (NumElts == 16) { 7091 BitsPerBool = 1; 7092 BoolMask = 0x1; 7093 } else 7094 return SDValue(); 7095 7096 // If this is a single value copied into all lanes (a splat), we can just sign 7097 // extend that single value 7098 SDValue FirstOp = Op.getOperand(0); 7099 if (!isa<ConstantSDNode>(FirstOp) && 7100 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7101 [&FirstOp](SDUse &U) { 7102 return U.get().isUndef() || U.get() == FirstOp; 7103 })) { 7104 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7105 DAG.getValueType(MVT::i1)); 7106 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7107 } 7108 7109 // First create base with bits set where known 7110 unsigned Bits32 = 0; 7111 for (unsigned i = 0; i < NumElts; ++i) { 7112 SDValue V = Op.getOperand(i); 7113 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7114 continue; 7115 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); 7116 if (BitSet) 7117 Bits32 |= BoolMask << (i * BitsPerBool); 7118 } 7119 7120 // Add in unknown nodes 7121 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7122 DAG.getConstant(Bits32, dl, MVT::i32)); 7123 for (unsigned i = 0; i < NumElts; ++i) { 7124 SDValue V = Op.getOperand(i); 7125 if (isa<ConstantSDNode>(V) || V.isUndef()) 7126 continue; 7127 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7128 DAG.getConstant(i, dl, MVT::i32)); 7129 } 7130 7131 return Base; 7132 } 7133 7134 // If this is a case we can't handle, return null and let the default 7135 // expansion code take care of it. 7136 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7137 const ARMSubtarget *ST) const { 7138 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7139 SDLoc dl(Op); 7140 EVT VT = Op.getValueType(); 7141 7142 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7143 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7144 7145 APInt SplatBits, SplatUndef; 7146 unsigned SplatBitSize; 7147 bool HasAnyUndefs; 7148 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7149 if (SplatUndef.isAllOnesValue()) 7150 return DAG.getUNDEF(VT); 7151 7152 if ((ST->hasNEON() && SplatBitSize <= 64) || 7153 (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { 7154 // Check if an immediate VMOV works. 
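// e.g. a v4i32 splat of 0x00ff0000 is materialised directly as a VMOV.I32
// using the 0x00nn0000 (Cmode=010x) encoding described in isVMOVModifiedImm.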
7155 EVT VmovVT; 7156 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 7157 SplatUndef.getZExtValue(), SplatBitSize, 7158 DAG, dl, VmovVT, VT.is128BitVector(), 7159 VMOVModImm); 7160 7161 if (Val.getNode()) { 7162 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7163 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7164 } 7165 7166 // Try an immediate VMVN. 7167 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7168 Val = isVMOVModifiedImm( 7169 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, 7170 DAG, dl, VmovVT, VT.is128BitVector(), 7171 ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7172 if (Val.getNode()) { 7173 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7174 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7175 } 7176 7177 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7178 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7179 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7180 if (ImmVal != -1) { 7181 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7182 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7183 } 7184 } 7185 } 7186 } 7187 7188 // Scan through the operands to see if only one value is used. 7189 // 7190 // As an optimisation, even if more than one value is used it may be more 7191 // profitable to splat with one value then change some lanes. 7192 // 7193 // Heuristically we decide to do this if the vector has a "dominant" value, 7194 // defined as splatted to more than half of the lanes. 7195 unsigned NumElts = VT.getVectorNumElements(); 7196 bool isOnlyLowElement = true; 7197 bool usesOnlyOneValue = true; 7198 bool hasDominantValue = false; 7199 bool isConstant = true; 7200 7201 // Map of the number of times a particular SDValue appears in the 7202 // element list. 7203 DenseMap<SDValue, unsigned> ValueCounts; 7204 SDValue Value; 7205 for (unsigned i = 0; i < NumElts; ++i) { 7206 SDValue V = Op.getOperand(i); 7207 if (V.isUndef()) 7208 continue; 7209 if (i > 0) 7210 isOnlyLowElement = false; 7211 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7212 isConstant = false; 7213 7214 ValueCounts.insert(std::make_pair(V, 0)); 7215 unsigned &Count = ValueCounts[V]; 7216 7217 // Is this value dominant? (takes up more than half of the lanes) 7218 if (++Count > (NumElts / 2)) { 7219 hasDominantValue = true; 7220 Value = V; 7221 } 7222 } 7223 if (ValueCounts.size() != 1) 7224 usesOnlyOneValue = false; 7225 if (!Value.getNode() && !ValueCounts.empty()) 7226 Value = ValueCounts.begin()->first; 7227 7228 if (ValueCounts.empty()) 7229 return DAG.getUNDEF(VT); 7230 7231 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7232 // Keep going if we are hitting this case. 7233 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7234 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7235 7236 unsigned EltSize = VT.getScalarSizeInBits(); 7237 7238 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7239 // i32 and try again. 7240 if (hasDominantValue && EltSize <= 32) { 7241 if (!isConstant) { 7242 SDValue N; 7243 7244 // If we are VDUPing a value that comes directly from a vector, that will 7245 // cause an unnecessary move to and from a GPR, where instead we could 7246 // just use VDUPLANE. We can only do this if the lane being extracted 7247 // is at a constant index, as the VDUP from lane instructions only have 7248 // constant-index forms. 
7249 ConstantSDNode *constIndex; 7250 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7251 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7252 // We need to create a new undef vector to use for the VDUPLANE if the 7253 // size of the vector from which we get the value is different than the 7254 // size of the vector that we need to create. We will insert the element 7255 // such that the register coalescer will remove unnecessary copies. 7256 if (VT != Value->getOperand(0).getValueType()) { 7257 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7258 VT.getVectorNumElements(); 7259 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7260 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7261 Value, DAG.getConstant(index, dl, MVT::i32)), 7262 DAG.getConstant(index, dl, MVT::i32)); 7263 } else 7264 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7265 Value->getOperand(0), Value->getOperand(1)); 7266 } else 7267 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7268 7269 if (!usesOnlyOneValue) { 7270 // The dominant value was splatted as 'N', but we now have to insert 7271 // all differing elements. 7272 for (unsigned I = 0; I < NumElts; ++I) { 7273 if (Op.getOperand(I) == Value) 7274 continue; 7275 SmallVector<SDValue, 3> Ops; 7276 Ops.push_back(N); 7277 Ops.push_back(Op.getOperand(I)); 7278 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7279 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7280 } 7281 } 7282 return N; 7283 } 7284 if (VT.getVectorElementType().isFloatingPoint()) { 7285 SmallVector<SDValue, 8> Ops; 7286 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7287 assert(FVT == MVT::f32 || FVT == MVT::f16); 7288 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7289 for (unsigned i = 0; i < NumElts; ++i) 7290 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7291 Op.getOperand(i))); 7292 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7293 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7294 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7295 if (Val.getNode()) 7296 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7297 } 7298 if (usesOnlyOneValue) { 7299 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7300 if (isConstant && Val.getNode()) 7301 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7302 } 7303 } 7304 7305 // If all elements are constants and the case above didn't get hit, fall back 7306 // to the default expansion, which will generate a load from the constant 7307 // pool. 7308 if (isConstant) 7309 return SDValue(); 7310 7311 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 7312 if (NumElts >= 4) { 7313 SDValue shuffle = ReconstructShuffle(Op, DAG); 7314 if (shuffle != SDValue()) 7315 return shuffle; 7316 } 7317 7318 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7319 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7320 // into two 64-bit vectors; we might discover a better way to lower it. 
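    // For example, a v8i16 build is retried here as two v4i16 builds that are
    // concatenated back together; each half may then hit the VMOV-immediate
    // or VDUP paths above even though the full vector did not.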
7321 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7322 EVT ExtVT = VT.getVectorElementType(); 7323 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7324 SDValue Lower = 7325 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7326 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7327 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7328 SDValue Upper = DAG.getBuildVector( 7329 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7330 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7331 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7332 if (Lower && Upper) 7333 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7334 } 7335 7336 // Vectors with 32- or 64-bit elements can be built by directly assigning 7337 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7338 // will be legalized. 7339 if (EltSize >= 32) { 7340 // Do the expansion with floating-point types, since that is what the VFP 7341 // registers are defined to use, and since i64 is not legal. 7342 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7343 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7344 SmallVector<SDValue, 8> Ops; 7345 for (unsigned i = 0; i < NumElts; ++i) 7346 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7347 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7348 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7349 } 7350 7351 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7352 // know the default expansion would otherwise fall back on something even 7353 // worse. For a vector with one or two non-undef values, that's 7354 // scalar_to_vector for the elements followed by a shuffle (provided the 7355 // shuffle is valid for the target) and materialization element by element 7356 // on the stack followed by a load for everything else. 7357 if (!isConstant && !usesOnlyOneValue) { 7358 SDValue Vec = DAG.getUNDEF(VT); 7359 for (unsigned i = 0 ; i < NumElts; ++i) { 7360 SDValue V = Op.getOperand(i); 7361 if (V.isUndef()) 7362 continue; 7363 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 7364 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7365 } 7366 return Vec; 7367 } 7368 7369 return SDValue(); 7370 } 7371 7372 // Gather data to see if the operation can be modelled as a 7373 // shuffle in combination with VEXTs. 7374 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 7375 SelectionDAG &DAG) const { 7376 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7377 SDLoc dl(Op); 7378 EVT VT = Op.getValueType(); 7379 unsigned NumElts = VT.getVectorNumElements(); 7380 7381 struct ShuffleSourceInfo { 7382 SDValue Vec; 7383 unsigned MinElt = std::numeric_limits<unsigned>::max(); 7384 unsigned MaxElt = 0; 7385 7386 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 7387 // be compatible with the shuffle we intend to construct. As a result 7388 // ShuffleVec will be some sliding window into the original Vec. 7389 SDValue ShuffleVec; 7390 7391 // Code should guarantee that element i in Vec starts at element "WindowBase 7392 // + i * WindowScale in ShuffleVec". 7393 int WindowBase = 0; 7394 int WindowScale = 1; 7395 7396 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 7397 7398 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 7399 }; 7400 7401 // First gather all vectors used as an immediate source for this BUILD_VECTOR 7402 // node. 
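  // For example, a BUILD_VECTOR of (extract A,1), (extract A,2), (extract A,3),
  // (extract B,0) can be rebuilt as a single shuffle of A and B, which for
  // this particular mask is just one VEXT.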
7403 SmallVector<ShuffleSourceInfo, 2> Sources; 7404 for (unsigned i = 0; i < NumElts; ++i) { 7405 SDValue V = Op.getOperand(i); 7406 if (V.isUndef()) 7407 continue; 7408 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 7409 // A shuffle can only come from building a vector from various 7410 // elements of other vectors. 7411 return SDValue(); 7412 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 7413 // Furthermore, shuffles require a constant mask, whereas extractelts 7414 // accept variable indices. 7415 return SDValue(); 7416 } 7417 7418 // Add this element source to the list if it's not already there. 7419 SDValue SourceVec = V.getOperand(0); 7420 auto Source = llvm::find(Sources, SourceVec); 7421 if (Source == Sources.end()) 7422 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7423 7424 // Update the minimum and maximum lane number seen. 7425 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7426 Source->MinElt = std::min(Source->MinElt, EltNo); 7427 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7428 } 7429 7430 // Currently only do something sane when at most two source vectors 7431 // are involved. 7432 if (Sources.size() > 2) 7433 return SDValue(); 7434 7435 // Find out the smallest element size among result and two sources, and use 7436 // it as element size to build the shuffle_vector. 7437 EVT SmallestEltTy = VT.getVectorElementType(); 7438 for (auto &Source : Sources) { 7439 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7440 if (SrcEltTy.bitsLT(SmallestEltTy)) 7441 SmallestEltTy = SrcEltTy; 7442 } 7443 unsigned ResMultiplier = 7444 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7445 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7446 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7447 7448 // If the source vector is too wide or too narrow, we may nevertheless be able 7449 // to construct a compatible shuffle either by concatenating it with UNDEF or 7450 // extracting a suitable range of elements. 7451 for (auto &Src : Sources) { 7452 EVT SrcVT = Src.ShuffleVec.getValueType(); 7453 7454 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 7455 continue; 7456 7457 // This stage of the search produces a source with the same element type as 7458 // the original, but with a total width matching the BUILD_VECTOR output. 7459 EVT EltVT = SrcVT.getVectorElementType(); 7460 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 7461 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7462 7463 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 7464 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 7465 return SDValue(); 7466 // We can pad out the smaller vector for free, so if it's part of a 7467 // shuffle... 
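      // ... we just widen it with UNDEF lanes here: e.g. a v2i32 source
      // feeding a v4i32 result becomes concat(src, undef), matching the
      // result width.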
7468 Src.ShuffleVec = 7469 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 7470 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 7471 continue; 7472 } 7473 7474 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 7475 return SDValue(); 7476 7477 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 7478 // Span too large for a VEXT to cope 7479 return SDValue(); 7480 } 7481 7482 if (Src.MinElt >= NumSrcElts) { 7483 // The extraction can just take the second half 7484 Src.ShuffleVec = 7485 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7486 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7487 Src.WindowBase = -NumSrcElts; 7488 } else if (Src.MaxElt < NumSrcElts) { 7489 // The extraction can just take the first half 7490 Src.ShuffleVec = 7491 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7492 DAG.getConstant(0, dl, MVT::i32)); 7493 } else { 7494 // An actual VEXT is needed 7495 SDValue VEXTSrc1 = 7496 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7497 DAG.getConstant(0, dl, MVT::i32)); 7498 SDValue VEXTSrc2 = 7499 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7500 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7501 7502 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 7503 VEXTSrc2, 7504 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 7505 Src.WindowBase = -Src.MinElt; 7506 } 7507 } 7508 7509 // Another possible incompatibility occurs from the vector element types. We 7510 // can fix this by bitcasting the source vectors to the same type we intend 7511 // for the shuffle. 7512 for (auto &Src : Sources) { 7513 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 7514 if (SrcEltTy == SmallestEltTy) 7515 continue; 7516 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 7517 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 7518 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7519 Src.WindowBase *= Src.WindowScale; 7520 } 7521 7522 // Final sanity check before we try to actually produce a shuffle. 7523 LLVM_DEBUG(for (auto Src 7524 : Sources) 7525 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 7526 7527 // The stars all align, our next step is to produce the mask for the shuffle. 7528 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 7529 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 7530 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 7531 SDValue Entry = Op.getOperand(i); 7532 if (Entry.isUndef()) 7533 continue; 7534 7535 auto Src = llvm::find(Sources, Entry.getOperand(0)); 7536 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 7537 7538 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 7539 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 7540 // segment. 7541 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 7542 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 7543 VT.getScalarSizeInBits()); 7544 int LanesDefined = BitsDefined / BitsPerShuffleLane; 7545 7546 // This source is expected to fill ResMultiplier lanes of the final shuffle, 7547 // starting at the appropriate offset. 7548 int *LaneMask = &Mask[i * ResMultiplier]; 7549 7550 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 7551 ExtractBase += NumElts * (Src - Sources.begin()); 7552 for (int j = 0; j < LanesDefined; ++j) 7553 LaneMask[j] = ExtractBase + j; 7554 } 7555 7556 7557 // We can't handle more than two sources. 
This should have already 7558 // been checked before this point. 7559 assert(Sources.size() <= 2 && "Too many sources!"); 7560 7561 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 7562 for (unsigned i = 0; i < Sources.size(); ++i) 7563 ShuffleOps[i] = Sources[i].ShuffleVec; 7564 7565 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 7566 ShuffleOps[1], Mask, DAG); 7567 if (!Shuffle) 7568 return SDValue(); 7569 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 7570 } 7571 7572 enum ShuffleOpCodes { 7573 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7574 OP_VREV, 7575 OP_VDUP0, 7576 OP_VDUP1, 7577 OP_VDUP2, 7578 OP_VDUP3, 7579 OP_VEXT1, 7580 OP_VEXT2, 7581 OP_VEXT3, 7582 OP_VUZPL, // VUZP, left result 7583 OP_VUZPR, // VUZP, right result 7584 OP_VZIPL, // VZIP, left result 7585 OP_VZIPR, // VZIP, right result 7586 OP_VTRNL, // VTRN, left result 7587 OP_VTRNR // VTRN, right result 7588 }; 7589 7590 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 7591 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7592 switch (OpNum) { 7593 case OP_COPY: 7594 case OP_VREV: 7595 case OP_VDUP0: 7596 case OP_VDUP1: 7597 case OP_VDUP2: 7598 case OP_VDUP3: 7599 return true; 7600 } 7601 return false; 7602 } 7603 7604 /// isShuffleMaskLegal - Targets can use this to indicate that they only 7605 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7606 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7607 /// are assumed to be legal. 7608 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7609 if (VT.getVectorNumElements() == 4 && 7610 (VT.is128BitVector() || VT.is64BitVector())) { 7611 unsigned PFIndexes[4]; 7612 for (unsigned i = 0; i != 4; ++i) { 7613 if (M[i] < 0) 7614 PFIndexes[i] = 8; 7615 else 7616 PFIndexes[i] = M[i]; 7617 } 7618 7619 // Compute the index in the perfect shuffle table. 7620 unsigned PFTableIndex = 7621 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7622 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7623 unsigned Cost = (PFEntry >> 30); 7624 7625 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7626 return true; 7627 } 7628 7629 bool ReverseVEXT, isV_UNDEF; 7630 unsigned Imm, WhichResult; 7631 7632 unsigned EltSize = VT.getScalarSizeInBits(); 7633 if (EltSize >= 32 || 7634 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7635 ShuffleVectorInst::isIdentityMask(M) || 7636 isVREVMask(M, VT, 64) || 7637 isVREVMask(M, VT, 32) || 7638 isVREVMask(M, VT, 16)) 7639 return true; 7640 else if (Subtarget->hasNEON() && 7641 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7642 isVTBLMask(M, VT) || 7643 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7644 return true; 7645 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7646 isReverseMask(M, VT)) 7647 return true; 7648 else if (Subtarget->hasMVEIntegerOps() && 7649 (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) 7650 return true; 7651 else 7652 return false; 7653 } 7654 7655 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7656 /// the specified operations to build the shuffle. 
7657 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7658 SDValue RHS, SelectionDAG &DAG, 7659 const SDLoc &dl) { 7660 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7661 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7662 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7663 7664 if (OpNum == OP_COPY) { 7665 if (LHSID == (1*9+2)*9+3) return LHS; 7666 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 7667 return RHS; 7668 } 7669 7670 SDValue OpLHS, OpRHS; 7671 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7672 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7673 EVT VT = OpLHS.getValueType(); 7674 7675 switch (OpNum) { 7676 default: llvm_unreachable("Unknown shuffle opcode!"); 7677 case OP_VREV: 7678 // VREV divides the vector in half and swaps within the half. 7679 if (VT.getVectorElementType() == MVT::i32 || 7680 VT.getVectorElementType() == MVT::f32) 7681 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 7682 // vrev <4 x i16> -> VREV32 7683 if (VT.getVectorElementType() == MVT::i16) 7684 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 7685 // vrev <4 x i8> -> VREV16 7686 assert(VT.getVectorElementType() == MVT::i8); 7687 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 7688 case OP_VDUP0: 7689 case OP_VDUP1: 7690 case OP_VDUP2: 7691 case OP_VDUP3: 7692 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7693 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 7694 case OP_VEXT1: 7695 case OP_VEXT2: 7696 case OP_VEXT3: 7697 return DAG.getNode(ARMISD::VEXT, dl, VT, 7698 OpLHS, OpRHS, 7699 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 7700 case OP_VUZPL: 7701 case OP_VUZPR: 7702 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 7703 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 7704 case OP_VZIPL: 7705 case OP_VZIPR: 7706 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 7707 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 7708 case OP_VTRNL: 7709 case OP_VTRNR: 7710 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 7711 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 7712 } 7713 } 7714 7715 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 7716 ArrayRef<int> ShuffleMask, 7717 SelectionDAG &DAG) { 7718 // Check to see if we can use the VTBL instruction. 7719 SDValue V1 = Op.getOperand(0); 7720 SDValue V2 = Op.getOperand(1); 7721 SDLoc DL(Op); 7722 7723 SmallVector<SDValue, 8> VTBLMask; 7724 for (ArrayRef<int>::iterator 7725 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 7726 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 7727 7728 if (V2.getNode()->isUndef()) 7729 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 7730 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7731 7732 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 7733 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7734 } 7735 7736 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 7737 SelectionDAG &DAG) { 7738 SDLoc DL(Op); 7739 SDValue OpLHS = Op.getOperand(0); 7740 EVT VT = OpLHS.getValueType(); 7741 7742 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 7743 "Expect an v8i16/v16i8 type"); 7744 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 7745 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 7746 // extract the first 8 bytes into the top double word and the last 8 bytes 7747 // into the bottom double word. The v8i16 case is similar. 7748 unsigned ExtractNum = (VT == MVT::v16i8) ? 
8 : 4; 7749 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 7750 DAG.getConstant(ExtractNum, DL, MVT::i32)); 7751 } 7752 7753 static EVT getVectorTyFromPredicateVector(EVT VT) { 7754 switch (VT.getSimpleVT().SimpleTy) { 7755 case MVT::v4i1: 7756 return MVT::v4i32; 7757 case MVT::v8i1: 7758 return MVT::v8i16; 7759 case MVT::v16i1: 7760 return MVT::v16i8; 7761 default: 7762 llvm_unreachable("Unexpected vector predicate type"); 7763 } 7764 } 7765 7766 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 7767 SelectionDAG &DAG) { 7768 // Converting from boolean predicates to integers involves creating a vector 7769 // of all ones or all zeroes and selecting the lanes based upon the real 7770 // predicate. 7771 SDValue AllOnes = 7772 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 7773 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 7774 7775 SDValue AllZeroes = 7776 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 7777 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 7778 7779 // Get full vector type from predicate type 7780 EVT NewVT = getVectorTyFromPredicateVector(VT); 7781 7782 SDValue RecastV1; 7783 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 7784 // this to a v16i1. This cannot be done with an ordinary bitcast because the 7785 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 7786 // since we know in hardware the sizes are really the same. 7787 if (VT != MVT::v16i1) 7788 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 7789 else 7790 RecastV1 = Pred; 7791 7792 // Select either all ones or zeroes depending upon the real predicate bits. 7793 SDValue PredAsVector = 7794 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 7795 7796 // Recast our new predicate-as-integer v16i8 vector into something 7797 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 7798 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 7799 } 7800 7801 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 7802 const ARMSubtarget *ST) { 7803 EVT VT = Op.getValueType(); 7804 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7805 ArrayRef<int> ShuffleMask = SVN->getMask(); 7806 7807 assert(ST->hasMVEIntegerOps() && 7808 "No support for vector shuffle of boolean predicates"); 7809 7810 SDValue V1 = Op.getOperand(0); 7811 SDLoc dl(Op); 7812 if (isReverseMask(ShuffleMask, VT)) { 7813 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 7814 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 7815 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 7816 DAG.getConstant(16, dl, MVT::i32)); 7817 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 7818 } 7819 7820 // Until we can come up with optimised cases for every single vector 7821 // shuffle in existence we have chosen the least painful strategy. This is 7822 // to essentially promote the boolean predicate to a 8-bit integer, where 7823 // each predicate represents a byte. Then we fall back on a normal integer 7824 // vector shuffle and convert the result back into a predicate vector. In 7825 // many cases the generated code might be even better than scalar code 7826 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 7827 // fields in a register into 8 other arbitrary 2-bit fields! 
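  // Concretely, a v8i1 shuffle becomes roughly: select the predicate into a
  // v16i8 of 0/-1 values, bitcast that to v8i16, perform an ordinary integer
  // vector shuffle, then compare the shuffled lanes against zero (VCMPZ NE)
  // to recreate the v8i1 predicate.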
7828 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 7829 EVT NewVT = PredAsVector.getValueType(); 7830 7831 // Do the shuffle! 7832 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 7833 DAG.getUNDEF(NewVT), ShuffleMask); 7834 7835 // Now return the result of comparing the shuffled vector with zero, 7836 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 7837 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 7838 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 7839 } 7840 7841 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 7842 ArrayRef<int> ShuffleMask, 7843 SelectionDAG &DAG) { 7844 // Attempt to lower the vector shuffle using as many whole register movs as 7845 // possible. This is useful for types smaller than 32bits, which would 7846 // often otherwise become a series for grp movs. 7847 SDLoc dl(Op); 7848 EVT VT = Op.getValueType(); 7849 if (VT.getScalarSizeInBits() >= 32) 7850 return SDValue(); 7851 7852 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 7853 "Unexpected vector type"); 7854 int NumElts = VT.getVectorNumElements(); 7855 int QuarterSize = NumElts / 4; 7856 // The four final parts of the vector, as i32's 7857 SDValue Parts[4]; 7858 7859 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not 7860 // <u,u,u,u>), returning the vmov lane index 7861 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 7862 // Detect which mov lane this would be from the first non-undef element. 7863 int MovIdx = -1; 7864 for (int i = 0; i < Length; i++) { 7865 if (ShuffleMask[Start + i] >= 0) { 7866 if (ShuffleMask[Start + i] % Length != i) 7867 return -1; 7868 MovIdx = ShuffleMask[Start + i] / Length; 7869 break; 7870 } 7871 } 7872 // If all items are undef, leave this for other combines 7873 if (MovIdx == -1) 7874 return -1; 7875 // Check the remaining values are the correct part of the same mov 7876 for (int i = 1; i < Length; i++) { 7877 if (ShuffleMask[Start + i] >= 0 && 7878 (ShuffleMask[Start + i] / Length != MovIdx || 7879 ShuffleMask[Start + i] % Length != i)) 7880 return -1; 7881 } 7882 return MovIdx; 7883 }; 7884 7885 for (int Part = 0; Part < 4; ++Part) { 7886 // Does this part look like a mov 7887 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 7888 if (Elt != -1) { 7889 SDValue Input = Op->getOperand(0); 7890 if (Elt >= 4) { 7891 Input = Op->getOperand(1); 7892 Elt -= 4; 7893 } 7894 SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); 7895 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, 7896 DAG.getConstant(Elt, dl, MVT::i32)); 7897 } 7898 } 7899 7900 // Nothing interesting found, just return 7901 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 7902 return SDValue(); 7903 7904 // The other parts need to be built with the old shuffle vector, cast to a 7905 // v4i32 and extract_vector_elts 7906 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 7907 SmallVector<int, 16> NewShuffleMask; 7908 for (int Part = 0; Part < 4; ++Part) 7909 for (int i = 0; i < QuarterSize; i++) 7910 NewShuffleMask.push_back( 7911 Parts[Part] ? 
-1 : ShuffleMask[Part * QuarterSize + i]); 7912 SDValue NewShuffle = DAG.getVectorShuffle( 7913 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 7914 SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); 7915 7916 for (int Part = 0; Part < 4; ++Part) 7917 if (!Parts[Part]) 7918 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7919 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 7920 } 7921 // Build a vector out of the various parts and bitcast it back to the original 7922 // type. 7923 SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); 7924 return DAG.getBitcast(VT, NewVec); 7925 } 7926 7927 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 7928 const ARMSubtarget *ST) { 7929 SDValue V1 = Op.getOperand(0); 7930 SDValue V2 = Op.getOperand(1); 7931 SDLoc dl(Op); 7932 EVT VT = Op.getValueType(); 7933 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7934 unsigned EltSize = VT.getScalarSizeInBits(); 7935 7936 if (ST->hasMVEIntegerOps() && EltSize == 1) 7937 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 7938 7939 // Convert shuffles that are directly supported on NEON to target-specific 7940 // DAG nodes, instead of keeping them as shuffles and matching them again 7941 // during code selection. This is more efficient and avoids the possibility 7942 // of inconsistencies between legalization and selection. 7943 // FIXME: floating-point vectors should be canonicalized to integer vectors 7944 // of the same time so that they get CSEd properly. 7945 ArrayRef<int> ShuffleMask = SVN->getMask(); 7946 7947 if (EltSize <= 32) { 7948 if (SVN->isSplat()) { 7949 int Lane = SVN->getSplatIndex(); 7950 // If this is undef splat, generate it via "just" vdup, if possible. 7951 if (Lane == -1) Lane = 0; 7952 7953 // Test if V1 is a SCALAR_TO_VECTOR. 7954 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7955 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7956 } 7957 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 7958 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 7959 // reaches it). 7960 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 7961 !isa<ConstantSDNode>(V1.getOperand(0))) { 7962 bool IsScalarToVector = true; 7963 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7964 if (!V1.getOperand(i).isUndef()) { 7965 IsScalarToVector = false; 7966 break; 7967 } 7968 if (IsScalarToVector) 7969 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7970 } 7971 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 7972 DAG.getConstant(Lane, dl, MVT::i32)); 7973 } 7974 7975 bool ReverseVEXT = false; 7976 unsigned Imm = 0; 7977 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7978 if (ReverseVEXT) 7979 std::swap(V1, V2); 7980 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7981 DAG.getConstant(Imm, dl, MVT::i32)); 7982 } 7983 7984 if (isVREVMask(ShuffleMask, VT, 64)) 7985 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7986 if (isVREVMask(ShuffleMask, VT, 32)) 7987 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 7988 if (isVREVMask(ShuffleMask, VT, 16)) 7989 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 7990 7991 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 7992 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 7993 DAG.getConstant(Imm, dl, MVT::i32)); 7994 } 7995 7996 // Check for Neon shuffles that modify both input vectors in place. 
7997 // If both results are used, i.e., if there are two shuffles with the same 7998 // source operands and with masks corresponding to both results of one of 7999 // these operations, DAG memoization will ensure that a single node is 8000 // used for both shuffles. 8001 unsigned WhichResult = 0; 8002 bool isV_UNDEF = false; 8003 if (ST->hasNEON()) { 8004 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8005 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8006 if (isV_UNDEF) 8007 V2 = V1; 8008 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8009 .getValue(WhichResult); 8010 } 8011 } 8012 if (ST->hasMVEIntegerOps()) { 8013 if (isVMOVNMask(ShuffleMask, VT, 0)) 8014 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8015 DAG.getConstant(0, dl, MVT::i32)); 8016 if (isVMOVNMask(ShuffleMask, VT, 1)) 8017 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8018 DAG.getConstant(1, dl, MVT::i32)); 8019 } 8020 8021 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8022 // shuffles that produce a result larger than their operands with: 8023 // shuffle(concat(v1, undef), concat(v2, undef)) 8024 // -> 8025 // shuffle(concat(v1, v2), undef) 8026 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8027 // 8028 // This is useful in the general case, but there are special cases where 8029 // native shuffles produce larger results: the two-result ops. 8030 // 8031 // Look through the concat when lowering them: 8032 // shuffle(concat(v1, v2), undef) 8033 // -> 8034 // concat(VZIP(v1, v2):0, :1) 8035 // 8036 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8037 SDValue SubV1 = V1->getOperand(0); 8038 SDValue SubV2 = V1->getOperand(1); 8039 EVT SubVT = SubV1.getValueType(); 8040 8041 // We expect these to have been canonicalized to -1. 8042 assert(llvm::all_of(ShuffleMask, [&](int i) { 8043 return i < (int)VT.getVectorNumElements(); 8044 }) && "Unexpected shuffle index into UNDEF operand!"); 8045 8046 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8047 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8048 if (isV_UNDEF) 8049 SubV2 = SubV1; 8050 assert((WhichResult == 0) && 8051 "In-place shuffle of concat can only have one result!"); 8052 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8053 SubV1, SubV2); 8054 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8055 Res.getValue(1)); 8056 } 8057 } 8058 } 8059 8060 // If the shuffle is not directly supported and it has 4 elements, use 8061 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8062 unsigned NumElts = VT.getVectorNumElements(); 8063 if (NumElts == 4) { 8064 unsigned PFIndexes[4]; 8065 for (unsigned i = 0; i != 4; ++i) { 8066 if (ShuffleMask[i] < 0) 8067 PFIndexes[i] = 8; 8068 else 8069 PFIndexes[i] = ShuffleMask[i]; 8070 } 8071 8072 // Compute the index in the perfect shuffle table. 
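    // Each mask element becomes a digit in the range [0,8] (8 encodes undef),
    // so the index below is simply the 4-digit base-9 number formed by the
    // mask; the table entry packs the cost in its top two bits and the opcode
    // and operand ids below that.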
8073 unsigned PFTableIndex = 8074 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8075 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8076 unsigned Cost = (PFEntry >> 30); 8077 8078 if (Cost <= 4) { 8079 if (ST->hasNEON()) 8080 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8081 else if (isLegalMVEShuffleOp(PFEntry)) { 8082 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8083 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8084 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8085 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8086 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8087 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8088 } 8089 } 8090 } 8091 8092 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 8093 if (EltSize >= 32) { 8094 // Do the expansion with floating-point types, since that is what the VFP 8095 // registers are defined to use, and since i64 is not legal. 8096 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8097 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8098 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 8099 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 8100 SmallVector<SDValue, 8> Ops; 8101 for (unsigned i = 0; i < NumElts; ++i) { 8102 if (ShuffleMask[i] < 0) 8103 Ops.push_back(DAG.getUNDEF(EltVT)); 8104 else 8105 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 8106 ShuffleMask[i] < (int)NumElts ? V1 : V2, 8107 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8108 dl, MVT::i32))); 8109 } 8110 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8111 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8112 } 8113 8114 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 8115 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 8116 8117 if (ST->hasNEON() && VT == MVT::v8i8) 8118 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8119 return NewOp; 8120 8121 if (ST->hasMVEIntegerOps()) 8122 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8123 return NewOp; 8124 8125 return SDValue(); 8126 } 8127 8128 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8129 const ARMSubtarget *ST) { 8130 EVT VecVT = Op.getOperand(0).getValueType(); 8131 SDLoc dl(Op); 8132 8133 assert(ST->hasMVEIntegerOps() && 8134 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8135 8136 SDValue Conv = 8137 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8138 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8139 unsigned LaneWidth = 8140 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8141 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8142 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8143 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8144 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8145 DAG.getConstant(~Mask, dl, MVT::i32)); 8146 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8147 } 8148 8149 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8150 SelectionDAG &DAG) const { 8151 // INSERT_VECTOR_ELT is legal only for immediate indexes. 
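  // With a variable lane we return SDValue() and let the generic legalizer
  // expand the insert, which typically goes through a stack temporary.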
8152 SDValue Lane = Op.getOperand(2); 8153 if (!isa<ConstantSDNode>(Lane)) 8154 return SDValue(); 8155 8156 SDValue Elt = Op.getOperand(1); 8157 EVT EltVT = Elt.getValueType(); 8158 8159 if (Subtarget->hasMVEIntegerOps() && 8160 Op.getValueType().getScalarSizeInBits() == 1) 8161 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8162 8163 if (getTypeAction(*DAG.getContext(), EltVT) == 8164 TargetLowering::TypePromoteFloat) { 8165 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8166 // but the type system will try to do that if we don't intervene. 8167 // Reinterpret any such vector-element insertion as one with the 8168 // corresponding integer types. 8169 8170 SDLoc dl(Op); 8171 8172 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8173 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8174 TargetLowering::TypePromoteFloat); 8175 8176 SDValue VecIn = Op.getOperand(0); 8177 EVT VecVT = VecIn.getValueType(); 8178 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8179 VecVT.getVectorNumElements()); 8180 8181 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8182 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8183 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8184 IVecIn, IElt, Lane); 8185 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8186 } 8187 8188 return Op; 8189 } 8190 8191 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8192 const ARMSubtarget *ST) { 8193 EVT VecVT = Op.getOperand(0).getValueType(); 8194 SDLoc dl(Op); 8195 8196 assert(ST->hasMVEIntegerOps() && 8197 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8198 8199 SDValue Conv = 8200 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8201 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8202 unsigned LaneWidth = 8203 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8204 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 8205 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8206 return Shift; 8207 } 8208 8209 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8210 const ARMSubtarget *ST) { 8211 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
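  // Note that extracting an i8/i16 element as an i32 below uses VGETLANEu,
  // which zero-extends the lane into the scalar result (something like
  // "vmov.u16 r0, d0[1]").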
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  EVT Op2VT = V2.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  assert(Op1VT == Op2VT && "Operand types don't match!");
  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

  // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Extract the vector elements from Op1 and Op2 one by one and truncate them
  // to be the right size for the destination. For example, if Op1 is v4i1 then
  // the promoted vector is v4i32. The result of concatenation gives a v8i1,
  // which when promoted is v8i16. That means each i32 element from Op1 needs
  // truncating to i16 and inserting in the result.
  EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
  SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
  auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
    EVT NewVT = NewV.getValueType();
    EVT ConcatVT = ConVec.getValueType();
    for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
                                DAG.getIntPtrConstant(i, dl));
      ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
                           DAG.getConstant(j, dl, MVT::i32));
    }
    return ConVec;
  };
  unsigned j = 0;
  ConVec = ExtractInto(NewV1, ConVec, j);
  ConVec = ExtractInto(NewV2, ConVec, j);

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  EVT VT = Op->getValueType(0);
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerCONCAT_VECTORS_i1(Op, DAG, ST);

  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
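  // Each 64-bit half is bitcast to f64 and inserted into one lane of a v2f64,
  // which is then bitcast back to the 128-bit result; this keeps the whole
  // concatenation inside the D/Q register file.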
8290 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 8291 "unexpected CONCAT_VECTORS"); 8292 SDLoc dl(Op); 8293 SDValue Val = DAG.getUNDEF(MVT::v2f64); 8294 SDValue Op0 = Op.getOperand(0); 8295 SDValue Op1 = Op.getOperand(1); 8296 if (!Op0.isUndef()) 8297 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8298 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 8299 DAG.getIntPtrConstant(0, dl)); 8300 if (!Op1.isUndef()) 8301 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8302 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 8303 DAG.getIntPtrConstant(1, dl)); 8304 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 8305 } 8306 8307 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 8308 const ARMSubtarget *ST) { 8309 SDValue V1 = Op.getOperand(0); 8310 SDValue V2 = Op.getOperand(1); 8311 SDLoc dl(Op); 8312 EVT VT = Op.getValueType(); 8313 EVT Op1VT = V1.getValueType(); 8314 unsigned NumElts = VT.getVectorNumElements(); 8315 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 8316 8317 assert(VT.getScalarSizeInBits() == 1 && 8318 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 8319 assert(ST->hasMVEIntegerOps() && 8320 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 8321 8322 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8323 8324 // We now have Op1 promoted to a vector of integers, where v8i1 gets 8325 // promoted to v8i16, etc. 8326 8327 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8328 8329 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 8330 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 8331 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 8332 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 8333 DAG.getIntPtrConstant(i, dl)); 8334 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 8335 DAG.getConstant(j, dl, MVT::i32)); 8336 } 8337 8338 // Now return the result of comparing the subvector with zero, 8339 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8340 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 8341 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8342 } 8343 8344 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 8345 /// element has been zero/sign-extended, depending on the isSigned parameter, 8346 /// from an integer type half its size. 8347 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 8348 bool isSigned) { 8349 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 8350 EVT VT = N->getValueType(0); 8351 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 8352 SDNode *BVN = N->getOperand(0).getNode(); 8353 if (BVN->getValueType(0) != MVT::v4i32 || 8354 BVN->getOpcode() != ISD::BUILD_VECTOR) 8355 return false; 8356 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 8357 unsigned HiElt = 1 - LoElt; 8358 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 8359 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 8360 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 8361 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 8362 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 8363 return false; 8364 if (isSigned) { 8365 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 8366 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 8367 return true; 8368 } else { 8369 if (Hi0->isNullValue() && Hi1->isNullValue()) 8370 return true; 8371 } 8372 return false; 8373 } 8374 8375 if (N->getOpcode() != ISD::BUILD_VECTOR) 8376 return false; 8377 8378 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 8379 SDNode *Elt = N->getOperand(i).getNode(); 8380 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 8381 unsigned EltSize = VT.getScalarSizeInBits(); 8382 unsigned HalfSize = EltSize / 2; 8383 if (isSigned) { 8384 if (!isIntN(HalfSize, C->getSExtValue())) 8385 return false; 8386 } else { 8387 if (!isUIntN(HalfSize, C->getZExtValue())) 8388 return false; 8389 } 8390 continue; 8391 } 8392 return false; 8393 } 8394 8395 return true; 8396 } 8397 8398 /// isSignExtended - Check if a node is a vector value that is sign-extended 8399 /// or a constant BUILD_VECTOR with sign-extended elements. 8400 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 8401 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 8402 return true; 8403 if (isExtendedBUILD_VECTOR(N, DAG, true)) 8404 return true; 8405 return false; 8406 } 8407 8408 /// isZeroExtended - Check if a node is a vector value that is zero-extended 8409 /// or a constant BUILD_VECTOR with zero-extended elements. 8410 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 8411 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 8412 return true; 8413 if (isExtendedBUILD_VECTOR(N, DAG, false)) 8414 return true; 8415 return false; 8416 } 8417 8418 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 8419 if (OrigVT.getSizeInBits() >= 64) 8420 return OrigVT; 8421 8422 assert(OrigVT.isSimple() && "Expecting a simple value type"); 8423 8424 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 8425 switch (OrigSimpleTy) { 8426 default: llvm_unreachable("Unexpected Vector Type"); 8427 case MVT::v2i8: 8428 case MVT::v2i16: 8429 return MVT::v2i32; 8430 case MVT::v4i8: 8431 return MVT::v4i16; 8432 } 8433 } 8434 8435 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 8436 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 8437 /// We insert the required extension here to get the vector to fill a D register. 8438 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 8439 const EVT &OrigTy, 8440 const EVT &ExtTy, 8441 unsigned ExtOpcode) { 8442 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 8443 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 8444 // 64-bits we need to insert a new extension so that it will be 64-bits. 8445 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 8446 if (OrigTy.getSizeInBits() >= 64) 8447 return N; 8448 8449 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
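  // For example, a v4i8 operand is only re-extended to v4i16 here (see
  // getExtensionTo64Bits), which is exactly enough to fill a D register.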
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}

/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
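  // For instance, a v4i32 BUILD_VECTOR whose constants all fit in 16 bits is
  // re-emitted as a v4i16 BUILD_VECTOR; the operands stay i32 constants and
  // are implicitly truncated, giving a 64-bit value VMULL can consume.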
8517 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 8518 EVT VT = N->getValueType(0); 8519 unsigned EltSize = VT.getScalarSizeInBits() / 2; 8520 unsigned NumElts = VT.getVectorNumElements(); 8521 MVT TruncVT = MVT::getIntegerVT(EltSize); 8522 SmallVector<SDValue, 8> Ops; 8523 SDLoc dl(N); 8524 for (unsigned i = 0; i != NumElts; ++i) { 8525 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 8526 const APInt &CInt = C->getAPIntValue(); 8527 // Element types smaller than 32 bits are not legal, so use i32 elements. 8528 // The values are implicitly truncated so sext vs. zext doesn't matter. 8529 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 8530 } 8531 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 8532 } 8533 8534 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 8535 unsigned Opcode = N->getOpcode(); 8536 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8537 SDNode *N0 = N->getOperand(0).getNode(); 8538 SDNode *N1 = N->getOperand(1).getNode(); 8539 return N0->hasOneUse() && N1->hasOneUse() && 8540 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 8541 } 8542 return false; 8543 } 8544 8545 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 8546 unsigned Opcode = N->getOpcode(); 8547 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8548 SDNode *N0 = N->getOperand(0).getNode(); 8549 SDNode *N1 = N->getOperand(1).getNode(); 8550 return N0->hasOneUse() && N1->hasOneUse() && 8551 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 8552 } 8553 return false; 8554 } 8555 8556 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 8557 // Multiplications are only custom-lowered for 128-bit vectors so that 8558 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 8559 EVT VT = Op.getValueType(); 8560 assert(VT.is128BitVector() && VT.isInteger() && 8561 "unexpected type for custom-lowering ISD::MUL"); 8562 SDNode *N0 = Op.getOperand(0).getNode(); 8563 SDNode *N1 = Op.getOperand(1).getNode(); 8564 unsigned NewOpc = 0; 8565 bool isMLA = false; 8566 bool isN0SExt = isSignExtended(N0, DAG); 8567 bool isN1SExt = isSignExtended(N1, DAG); 8568 if (isN0SExt && isN1SExt) 8569 NewOpc = ARMISD::VMULLs; 8570 else { 8571 bool isN0ZExt = isZeroExtended(N0, DAG); 8572 bool isN1ZExt = isZeroExtended(N1, DAG); 8573 if (isN0ZExt && isN1ZExt) 8574 NewOpc = ARMISD::VMULLu; 8575 else if (isN1SExt || isN1ZExt) { 8576 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 8577 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 8578 if (isN1SExt && isAddSubSExt(N0, DAG)) { 8579 NewOpc = ARMISD::VMULLs; 8580 isMLA = true; 8581 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 8582 NewOpc = ARMISD::VMULLu; 8583 isMLA = true; 8584 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 8585 std::swap(N0, N1); 8586 NewOpc = ARMISD::VMULLu; 8587 isMLA = true; 8588 } 8589 } 8590 8591 if (!NewOpc) { 8592 if (VT == MVT::v2i64) 8593 // Fall through to expand this. It is not legal. 8594 return SDValue(); 8595 else 8596 // Other vector multiplications are legal. 8597 return Op; 8598 } 8599 } 8600 8601 // Legalize to a VMULL instruction. 
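  // For example, a v4i32 multiply of two sign-extended v4i16 values becomes a
  // single ARMISD::VMULLs of the narrow operands, i.e. one vmull.s16.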
8602 SDLoc DL(Op); 8603 SDValue Op0; 8604 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 8605 if (!isMLA) { 8606 Op0 = SkipExtensionForVMULL(N0, DAG); 8607 assert(Op0.getValueType().is64BitVector() && 8608 Op1.getValueType().is64BitVector() && 8609 "unexpected types for extended operands to VMULL"); 8610 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 8611 } 8612 8613 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 8614 // isel lowering to take advantage of no-stall back to back vmul + vmla. 8615 // vmull q0, d4, d6 8616 // vmlal q0, d5, d6 8617 // is faster than 8618 // vaddl q0, d4, d5 8619 // vmovl q1, d6 8620 // vmul q0, q0, q1 8621 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 8622 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 8623 EVT Op1VT = Op1.getValueType(); 8624 return DAG.getNode(N0->getOpcode(), DL, VT, 8625 DAG.getNode(NewOpc, DL, VT, 8626 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 8627 DAG.getNode(NewOpc, DL, VT, 8628 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 8629 } 8630 8631 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 8632 SelectionDAG &DAG) { 8633 // TODO: Should this propagate fast-math-flags? 8634 8635 // Convert to float 8636 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 8637 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 8638 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 8639 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 8640 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 8641 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 8642 // Get reciprocal estimate. 8643 // float4 recip = vrecpeq_f32(yf); 8644 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8645 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8646 Y); 8647 // Because char has a smaller range than uchar, we can actually get away 8648 // without any newton steps. This requires that we use a weird bias 8649 // of 0xb000, however (again, this has been exhaustively tested). 8650 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 8651 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 8652 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 8653 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 8654 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 8655 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 8656 // Convert back to short. 8657 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 8658 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 8659 return X; 8660 } 8661 8662 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 8663 SelectionDAG &DAG) { 8664 // TODO: Should this propagate fast-math-flags? 8665 8666 SDValue N2; 8667 // Convert to float. 8668 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 8669 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 8670 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 8671 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 8672 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8673 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8674 8675 // Use reciprocal estimate and one refinement step. 
8676 // float4 recip = vrecpeq_f32(yf); 8677 // recip *= vrecpsq_f32(yf, recip); 8678 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8679 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8680 N1); 8681 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8682 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8683 N1, N2); 8684 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8685 // Because short has a smaller range than ushort, we can actually get away 8686 // with only a single newton step. This requires that we use a weird bias 8687 // of 89, however (again, this has been exhaustively tested). 8688 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 8689 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8690 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8691 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 8692 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8693 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8694 // Convert back to integer and return. 8695 // return vmovn_s32(vcvt_s32_f32(result)); 8696 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8697 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8698 return N0; 8699 } 8700 8701 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 8702 const ARMSubtarget *ST) { 8703 EVT VT = Op.getValueType(); 8704 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8705 "unexpected type for custom-lowering ISD::SDIV"); 8706 8707 SDLoc dl(Op); 8708 SDValue N0 = Op.getOperand(0); 8709 SDValue N1 = Op.getOperand(1); 8710 SDValue N2, N3; 8711 8712 if (VT == MVT::v8i8) { 8713 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 8714 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 8715 8716 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8717 DAG.getIntPtrConstant(4, dl)); 8718 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8719 DAG.getIntPtrConstant(4, dl)); 8720 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8721 DAG.getIntPtrConstant(0, dl)); 8722 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8723 DAG.getIntPtrConstant(0, dl)); 8724 8725 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 8726 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 8727 8728 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8729 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8730 8731 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 8732 return N0; 8733 } 8734 return LowerSDIV_v4i16(N0, N1, dl, DAG); 8735 } 8736 8737 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 8738 const ARMSubtarget *ST) { 8739 // TODO: Should this propagate fast-math-flags? 
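  // This mirrors LowerSDIV above: a v8i8 udiv is zero-extended to v8i16 and
  // split into two v4i16 halves (after zero-extension the values are small
  // enough to go through the signed v4i16 helper), while the v4i16 case is
  // handled inline below using a reciprocal estimate with two refinement
  // steps.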
8740 EVT VT = Op.getValueType(); 8741 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8742 "unexpected type for custom-lowering ISD::UDIV"); 8743 8744 SDLoc dl(Op); 8745 SDValue N0 = Op.getOperand(0); 8746 SDValue N1 = Op.getOperand(1); 8747 SDValue N2, N3; 8748 8749 if (VT == MVT::v8i8) { 8750 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 8751 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 8752 8753 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8754 DAG.getIntPtrConstant(4, dl)); 8755 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8756 DAG.getIntPtrConstant(4, dl)); 8757 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8758 DAG.getIntPtrConstant(0, dl)); 8759 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8760 DAG.getIntPtrConstant(0, dl)); 8761 8762 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 8763 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 8764 8765 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8766 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8767 8768 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 8769 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 8770 MVT::i32), 8771 N0); 8772 return N0; 8773 } 8774 8775 // v4i16 sdiv ... Convert to float. 8776 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 8777 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 8778 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 8779 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 8780 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8781 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8782 8783 // Use reciprocal estimate and two refinement steps. 8784 // float4 recip = vrecpeq_f32(yf); 8785 // recip *= vrecpsq_f32(yf, recip); 8786 // recip *= vrecpsq_f32(yf, recip); 8787 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8788 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8789 BN1); 8790 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8791 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8792 BN1, N2); 8793 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8794 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8795 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8796 BN1, N2); 8797 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8798 // Simply multiplying by the reciprocal estimate can leave us a few ulps 8799 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 8800 // and that it will never cause us to return an answer too large). 8801 // float4 result = as_float4(as_int4(xf*recip) + 2); 8802 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8803 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8804 N1 = DAG.getConstant(2, dl, MVT::v4i32); 8805 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8806 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8807 // Convert back to integer and return. 8808 // return vmovn_u32(vcvt_s32_f32(result)); 8809 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8810 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8811 return N0; 8812 } 8813 8814 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 8815 SDNode *N = Op.getNode(); 8816 EVT VT = N->getValueType(0); 8817 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 8818 8819 SDValue Carry = Op.getOperand(2); 8820 8821 SDLoc DL(Op); 8822 8823 SDValue Result; 8824 if (Op.getOpcode() == ISD::ADDCARRY) { 8825 // This converts the boolean value carry into the carry flag. 
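    // (ISD::ADDCARRY passes the carry around as a 0/1 value, whereas
    // ARMISD::ADDE consumes and produces it in flag form, hence the
    // ConvertBooleanCarryToCarryFlag / ConvertCarryFlagToBooleanCarry helpers
    // on either side of the operation. The SUBCARRY path below additionally
    // computes 1 - C because ARM subtraction treats the flag as a carry
    // rather than a borrow.)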
8826 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8827 8828 // Do the addition proper using the carry flag we wanted. 8829 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 8830 Op.getOperand(1), Carry); 8831 8832 // Now convert the carry flag into a boolean value. 8833 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8834 } else { 8835 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 8836 // have to invert the carry first. 8837 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8838 DAG.getConstant(1, DL, MVT::i32), Carry); 8839 // This converts the boolean value carry into the carry flag. 8840 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8841 8842 // Do the subtraction proper using the carry flag we wanted. 8843 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 8844 Op.getOperand(1), Carry); 8845 8846 // Now convert the carry flag into a boolean value. 8847 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8848 // But the carry returned by ARMISD::SUBE is not a borrow as expected 8849 // by ISD::SUBCARRY, so compute 1 - C. 8850 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8851 DAG.getConstant(1, DL, MVT::i32), Carry); 8852 } 8853 8854 // Return both values. 8855 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 8856 } 8857 8858 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 8859 assert(Subtarget->isTargetDarwin()); 8860 8861 // For iOS, we want to call an alternative entry point: __sincos_stret, 8862 // return values are passed via sret. 8863 SDLoc dl(Op); 8864 SDValue Arg = Op.getOperand(0); 8865 EVT ArgVT = Arg.getValueType(); 8866 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8867 auto PtrVT = getPointerTy(DAG.getDataLayout()); 8868 8869 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8870 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8871 8872 // Pair of floats / doubles used to pass the result. 8873 Type *RetTy = StructType::get(ArgTy, ArgTy); 8874 auto &DL = DAG.getDataLayout(); 8875 8876 ArgListTy Args; 8877 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 8878 SDValue SRet; 8879 if (ShouldUseSRet) { 8880 // Create stack object for sret. 8881 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 8882 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 8883 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 8884 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 8885 8886 ArgListEntry Entry; 8887 Entry.Node = SRet; 8888 Entry.Ty = RetTy->getPointerTo(); 8889 Entry.IsSExt = false; 8890 Entry.IsZExt = false; 8891 Entry.IsSRet = true; 8892 Args.push_back(Entry); 8893 RetTy = Type::getVoidTy(*DAG.getContext()); 8894 } 8895 8896 ArgListEntry Entry; 8897 Entry.Node = Arg; 8898 Entry.Ty = ArgTy; 8899 Entry.IsSExt = false; 8900 Entry.IsZExt = false; 8901 Args.push_back(Entry); 8902 8903 RTLIB::Libcall LC = 8904 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 8905 const char *LibcallName = getLibcallName(LC); 8906 CallingConv::ID CC = getLibcallCallingConv(LC); 8907 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 8908 8909 TargetLowering::CallLoweringInfo CLI(DAG); 8910 CLI.setDebugLoc(dl) 8911 .setChain(DAG.getEntryNode()) 8912 .setCallee(CC, RetTy, Callee, std::move(Args)) 8913 .setDiscardResult(ShouldUseSRet); 8914 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 8915 8916 if (!ShouldUseSRet) 8917 return CallResult.first; 8918 8919 SDValue LoadSin = 8920 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 8921 8922 // Address of cos field. 8923 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 8924 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 8925 SDValue LoadCos = 8926 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 8927 8928 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 8929 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 8930 LoadSin.getValue(0), LoadCos.getValue(0)); 8931 } 8932 8933 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 8934 bool Signed, 8935 SDValue &Chain) const { 8936 EVT VT = Op.getValueType(); 8937 assert((VT == MVT::i32 || VT == MVT::i64) && 8938 "unexpected type for custom lowering DIV"); 8939 SDLoc dl(Op); 8940 8941 const auto &DL = DAG.getDataLayout(); 8942 const auto &TLI = DAG.getTargetLoweringInfo(); 8943 8944 const char *Name = nullptr; 8945 if (Signed) 8946 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 8947 else 8948 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 8949 8950 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 8951 8952 ARMTargetLowering::ArgListTy Args; 8953 8954 for (auto AI : {1, 0}) { 8955 ArgListEntry Arg; 8956 Arg.Node = Op.getOperand(AI); 8957 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 8958 Args.push_back(Arg); 8959 } 8960 8961 CallLoweringInfo CLI(DAG); 8962 CLI.setDebugLoc(dl) 8963 .setChain(Chain) 8964 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 8965 ES, std::move(Args)); 8966 8967 return LowerCallTo(CLI).first; 8968 } 8969 8970 // This is a code size optimisation: return the original SDIV node to 8971 // DAGCombiner when we don't want to expand SDIV into a sequence of 8972 // instructions, and an empty node otherwise which will cause the 8973 // SDIV to be expanded in DAGCombine. 8974 SDValue 8975 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 8976 SelectionDAG &DAG, 8977 SmallVectorImpl<SDNode *> &Created) const { 8978 // TODO: Support SREM 8979 if (N->getOpcode() != ISD::SDIV) 8980 return SDValue(); 8981 8982 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 8983 const bool MinSize = ST.hasMinSize(); 8984 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 8985 : ST.hasDivideInARMMode(); 8986 8987 // Don't touch vector types; rewriting this may lead to scalarizing 8988 // the int divs. 8989 if (N->getOperand(0).getValueType().isVector()) 8990 return SDValue(); 8991 8992 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 8993 // hwdiv support for this to be really profitable. 8994 if (!(MinSize && HasDivide)) 8995 return SDValue(); 8996 8997 // ARM mode is a bit simpler than Thumb: we can handle large power 8998 // of 2 immediates with 1 mov instruction; no further checks required, 8999 // just return the sdiv node. 
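  // (For example, at minsize with hardware divide, "x / 256" can be kept as
  // "mov rN, #256; sdiv", which should be no larger than the generic
  // shift-based power-of-two expansion.)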
9000   if (!ST.isThumb())
9001     return SDValue(N, 0);
9002
9003   // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9004   // and thus lose the code size benefits of a MOVS that requires only 2 bytes.
9005   // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9006   // but as this check is doing exactly that, it's not worth the trouble to get TTI.
9007   if (Divisor.sgt(128))
9008     return SDValue();
9009
9010   return SDValue(N, 0);
9011 }
9012
9013 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9014                                             bool Signed) const {
9015   assert(Op.getValueType() == MVT::i32 &&
9016          "unexpected type for custom lowering DIV");
9017   SDLoc dl(Op);
9018
9019   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9020                                DAG.getEntryNode(), Op.getOperand(1));
9021
9022   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9023 }
9024
9025 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9026   SDLoc DL(N);
9027   SDValue Op = N->getOperand(1);
9028   if (N->getValueType(0) == MVT::i32)
9029     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9030   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9031                            DAG.getConstant(0, DL, MVT::i32));
9032   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9033                            DAG.getConstant(1, DL, MVT::i32));
9034   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9035                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9036 }
9037
9038 void ARMTargetLowering::ExpandDIV_Windows(
9039     SDValue Op, SelectionDAG &DAG, bool Signed,
9040     SmallVectorImpl<SDValue> &Results) const {
9041   const auto &DL = DAG.getDataLayout();
9042   const auto &TLI = DAG.getTargetLoweringInfo();
9043
9044   assert(Op.getValueType() == MVT::i64 &&
9045          "unexpected type for custom lowering DIV");
9046   SDLoc dl(Op);
9047
9048   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9049
9050   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9051
9052   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9053   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9054                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
9055   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9056
9057   Results.push_back(Lower);
9058   Results.push_back(Upper);
9059 }
9060
9061 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9062   LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9063   EVT MemVT = LD->getMemoryVT();
9064   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9065          "Expected a predicate type!");
9066   assert(MemVT == Op.getValueType());
9067   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9068          "Expected a non-extending load");
9069   assert(LD->isUnindexed() && "Expected an unindexed load");
9070
9071   // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit
9072   // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9073   // need to make sure that the 8/4 bits are actually loaded into the correct
9074   // place, which means loading the value and then shuffling the values into
9075   // the bottom bits of the predicate.
9076   // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
9077   // for BE).
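  // (So instead of a predicate VLDR, the code below loads just
  // MemVT.getSizeInBits() bits as an integer extload and then casts the
  // result back to a predicate, extracting the MemVT-sized subvector for the
  // narrower v4i1/v8i1 types.)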
9078
9079   SDLoc dl(Op);
9080   SDValue Load = DAG.getExtLoad(
9081       ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9082       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9083       LD->getMemOperand());
9084   SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
9085   if (MemVT != MVT::v16i1)
9086     Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9087                        DAG.getConstant(0, dl, MVT::i32));
9088   return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9089 }
9090
9091 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9092                                   SelectionDAG &DAG) const {
9093   LoadSDNode *LD = cast<LoadSDNode>(N);
9094   EVT MemVT = LD->getMemoryVT();
9095   assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9096
9097   if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9098       !Subtarget->isThumb1Only() && LD->isVolatile()) {
9099     SDLoc dl(N);
9100     SDValue Result = DAG.getMemIntrinsicNode(
9101         ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9102         {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9103     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
9104                                Result.getValue(0), Result.getValue(1));
9105     Results.append({Pair, Result.getValue(2)});
9106   }
9107 }
9108
9109 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9110   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9111   EVT MemVT = ST->getMemoryVT();
9112   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9113          "Expected a predicate type!");
9114   assert(MemVT == ST->getValue().getValueType());
9115   assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
9116   assert(ST->isUnindexed() && "Expected an unindexed store");
9117
9118   // Only store the v4i1 or v8i1 worth of bits, via a buildvector with the
9119   // unused top lanes undef and a scalar store.
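  // (Concretely: the value is widened to a v16i1 BUILD_VECTOR with undef in
  // the unused lanes, cast to an i32 holding the predicate bits via
  // ARMISD::PREDICATE_CAST, and then only MemVT.getSizeInBits() of those bits
  // are written back with a truncating store.)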
9120 SDLoc dl(Op); 9121 SDValue Build = ST->getValue(); 9122 if (MemVT != MVT::v16i1) { 9123 SmallVector<SDValue, 16> Ops; 9124 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) 9125 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 9126 DAG.getConstant(I, dl, MVT::i32))); 9127 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 9128 Ops.push_back(DAG.getUNDEF(MVT::i32)); 9129 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 9130 } 9131 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 9132 return DAG.getTruncStore( 9133 ST->getChain(), dl, GRP, ST->getBasePtr(), 9134 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9135 ST->getMemOperand()); 9136 } 9137 9138 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, 9139 const ARMSubtarget *Subtarget) { 9140 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9141 EVT MemVT = ST->getMemoryVT(); 9142 assert(ST->isUnindexed() && "Stores should be unindexed at this point."); 9143 9144 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 9145 !Subtarget->isThumb1Only() && ST->isVolatile()) { 9146 SDNode *N = Op.getNode(); 9147 SDLoc dl(N); 9148 9149 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9150 DAG.getTargetConstant(0, dl, MVT::i32)); 9151 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9152 DAG.getTargetConstant(1, dl, MVT::i32)); 9153 9154 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), 9155 {ST->getChain(), Lo, Hi, ST->getBasePtr()}, 9156 MemVT, ST->getMemOperand()); 9157 } else if (Subtarget->hasMVEIntegerOps() && 9158 ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 9159 MemVT == MVT::v16i1))) { 9160 return LowerPredicateStore(Op, DAG); 9161 } 9162 9163 return SDValue(); 9164 } 9165 9166 static bool isZeroVector(SDValue N) { 9167 return (ISD::isBuildVectorAllZeros(N.getNode()) || 9168 (N->getOpcode() == ARMISD::VMOVIMM && 9169 isNullConstant(N->getOperand(0)))); 9170 } 9171 9172 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 9173 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 9174 MVT VT = Op.getSimpleValueType(); 9175 SDValue Mask = N->getMask(); 9176 SDValue PassThru = N->getPassThru(); 9177 SDLoc dl(Op); 9178 9179 if (isZeroVector(PassThru)) 9180 return Op; 9181 9182 // MVE Masked loads use zero as the passthru value. Here we convert undef to 9183 // zero too, and other values are lowered to a select. 9184 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 9185 DAG.getTargetConstant(0, dl, MVT::i32)); 9186 SDValue NewLoad = DAG.getMaskedLoad( 9187 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, 9188 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), 9189 N->getExtensionType(), N->isExpandingLoad()); 9190 SDValue Combo = NewLoad; 9191 if (!PassThru.isUndef() && 9192 (PassThru.getOpcode() != ISD::BITCAST || 9193 !isZeroVector(PassThru->getOperand(0)))) 9194 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 9195 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 9196 } 9197 9198 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 9199 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 9200 // Acquire/Release load/store is not legal for targets without a dmb or 9201 // equivalent available. 9202 return SDValue(); 9203 9204 // Monotonic load/store is legal for all targets. 
9205 return Op; 9206 } 9207 9208 static void ReplaceREADCYCLECOUNTER(SDNode *N, 9209 SmallVectorImpl<SDValue> &Results, 9210 SelectionDAG &DAG, 9211 const ARMSubtarget *Subtarget) { 9212 SDLoc DL(N); 9213 // Under Power Management extensions, the cycle-count is: 9214 // mrc p15, #0, <Rt>, c9, c13, #0 9215 SDValue Ops[] = { N->getOperand(0), // Chain 9216 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 9217 DAG.getTargetConstant(15, DL, MVT::i32), 9218 DAG.getTargetConstant(0, DL, MVT::i32), 9219 DAG.getTargetConstant(9, DL, MVT::i32), 9220 DAG.getTargetConstant(13, DL, MVT::i32), 9221 DAG.getTargetConstant(0, DL, MVT::i32) 9222 }; 9223 9224 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 9225 DAG.getVTList(MVT::i32, MVT::Other), Ops); 9226 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 9227 DAG.getConstant(0, DL, MVT::i32))); 9228 Results.push_back(Cycles32.getValue(1)); 9229 } 9230 9231 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 9232 SDLoc dl(V.getNode()); 9233 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 9234 SDValue VHi = DAG.getAnyExtOrTrunc( 9235 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 9236 dl, MVT::i32); 9237 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9238 if (isBigEndian) 9239 std::swap (VLo, VHi); 9240 SDValue RegClass = 9241 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 9242 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 9243 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 9244 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 9245 return SDValue( 9246 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 9247 } 9248 9249 static void ReplaceCMP_SWAP_64Results(SDNode *N, 9250 SmallVectorImpl<SDValue> & Results, 9251 SelectionDAG &DAG) { 9252 assert(N->getValueType(0) == MVT::i64 && 9253 "AtomicCmpSwap on types less than 64 should be legal"); 9254 SDValue Ops[] = {N->getOperand(1), 9255 createGPRPairNode(DAG, N->getOperand(2)), 9256 createGPRPairNode(DAG, N->getOperand(3)), 9257 N->getOperand(0)}; 9258 SDNode *CmpSwap = DAG.getMachineNode( 9259 ARM::CMP_SWAP_64, SDLoc(N), 9260 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 9261 9262 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 9263 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 9264 9265 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9266 9267 Results.push_back( 9268 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 9269 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9270 Results.push_back( 9271 DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, 9272 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9273 Results.push_back(SDValue(CmpSwap, 2)); 9274 } 9275 9276 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9277 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 9278 switch (Op.getOpcode()) { 9279 default: llvm_unreachable("Don't know how to custom lower this!"); 9280 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 9281 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9282 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9283 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9284 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9285 case ISD::SELECT: return LowerSELECT(Op, DAG); 9286 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9287 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9288 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 9289 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 9290 case ISD::VASTART: return LowerVASTART(Op, DAG); 9291 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 9292 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 9293 case ISD::SINT_TO_FP: 9294 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9295 case ISD::STRICT_FP_TO_SINT: 9296 case ISD::STRICT_FP_TO_UINT: 9297 case ISD::FP_TO_SINT: 9298 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 9299 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9300 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9301 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9302 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 9303 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 9304 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 9305 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 9306 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 9307 Subtarget); 9308 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 9309 case ISD::SHL: 9310 case ISD::SRL: 9311 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 9312 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 9313 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 9314 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 9315 case ISD::SRL_PARTS: 9316 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 9317 case ISD::CTTZ: 9318 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 9319 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 9320 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 9321 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 9322 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 9323 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 9324 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 9325 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 9326 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9327 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 9328 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 9329 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9330 case ISD::MUL: return LowerMUL(Op, DAG); 9331 case ISD::SDIV: 9332 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9333 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 9334 return 
LowerSDIV(Op, DAG, Subtarget); 9335 case ISD::UDIV: 9336 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9337 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 9338 return LowerUDIV(Op, DAG, Subtarget); 9339 case ISD::ADDCARRY: 9340 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 9341 case ISD::SADDO: 9342 case ISD::SSUBO: 9343 return LowerSignedALUO(Op, DAG); 9344 case ISD::UADDO: 9345 case ISD::USUBO: 9346 return LowerUnsignedALUO(Op, DAG); 9347 case ISD::SADDSAT: 9348 case ISD::SSUBSAT: 9349 return LowerSADDSUBSAT(Op, DAG, Subtarget); 9350 case ISD::LOAD: 9351 return LowerPredicateLoad(Op, DAG); 9352 case ISD::STORE: 9353 return LowerSTORE(Op, DAG, Subtarget); 9354 case ISD::MLOAD: 9355 return LowerMLOAD(Op, DAG); 9356 case ISD::ATOMIC_LOAD: 9357 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 9358 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 9359 case ISD::SDIVREM: 9360 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 9361 case ISD::DYNAMIC_STACKALLOC: 9362 if (Subtarget->isTargetWindows()) 9363 return LowerDYNAMIC_STACKALLOC(Op, DAG); 9364 llvm_unreachable("Don't know how to custom lower this!"); 9365 case ISD::STRICT_FP_ROUND: 9366 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 9367 case ISD::STRICT_FP_EXTEND: 9368 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 9369 case ARMISD::WIN__DBZCHK: return SDValue(); 9370 } 9371 } 9372 9373 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 9374 SelectionDAG &DAG) { 9375 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9376 unsigned Opc = 0; 9377 if (IntNo == Intrinsic::arm_smlald) 9378 Opc = ARMISD::SMLALD; 9379 else if (IntNo == Intrinsic::arm_smlaldx) 9380 Opc = ARMISD::SMLALDX; 9381 else if (IntNo == Intrinsic::arm_smlsld) 9382 Opc = ARMISD::SMLSLD; 9383 else if (IntNo == Intrinsic::arm_smlsldx) 9384 Opc = ARMISD::SMLSLDX; 9385 else 9386 return; 9387 9388 SDLoc dl(N); 9389 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9390 N->getOperand(3), 9391 DAG.getConstant(0, dl, MVT::i32)); 9392 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9393 N->getOperand(3), 9394 DAG.getConstant(1, dl, MVT::i32)); 9395 9396 SDValue LongMul = DAG.getNode(Opc, dl, 9397 DAG.getVTList(MVT::i32, MVT::i32), 9398 N->getOperand(1), N->getOperand(2), 9399 Lo, Hi); 9400 Results.push_back(LongMul.getValue(0)); 9401 Results.push_back(LongMul.getValue(1)); 9402 } 9403 9404 /// ReplaceNodeResults - Replace the results of node with an illegal result 9405 /// type with new values built out of custom code. 
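/// This is the type-legalization counterpart of LowerOperation above: it is
/// invoked for nodes whose result type is illegal for the target, and each
/// case either appends the replacement values to Results or leaves it empty
/// so the common legalization code handles the node.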
9406 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 9407 SmallVectorImpl<SDValue> &Results, 9408 SelectionDAG &DAG) const { 9409 SDValue Res; 9410 switch (N->getOpcode()) { 9411 default: 9412 llvm_unreachable("Don't know how to custom expand this!"); 9413 case ISD::READ_REGISTER: 9414 ExpandREAD_REGISTER(N, Results, DAG); 9415 break; 9416 case ISD::BITCAST: 9417 Res = ExpandBITCAST(N, DAG, Subtarget); 9418 break; 9419 case ISD::SRL: 9420 case ISD::SRA: 9421 case ISD::SHL: 9422 Res = Expand64BitShift(N, DAG, Subtarget); 9423 break; 9424 case ISD::SREM: 9425 case ISD::UREM: 9426 Res = LowerREM(N, DAG); 9427 break; 9428 case ISD::SDIVREM: 9429 case ISD::UDIVREM: 9430 Res = LowerDivRem(SDValue(N, 0), DAG); 9431 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 9432 Results.push_back(Res.getValue(0)); 9433 Results.push_back(Res.getValue(1)); 9434 return; 9435 case ISD::SADDSAT: 9436 case ISD::SSUBSAT: 9437 Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 9438 break; 9439 case ISD::READCYCLECOUNTER: 9440 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 9441 return; 9442 case ISD::UDIV: 9443 case ISD::SDIV: 9444 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 9445 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 9446 Results); 9447 case ISD::ATOMIC_CMP_SWAP: 9448 ReplaceCMP_SWAP_64Results(N, Results, DAG); 9449 return; 9450 case ISD::INTRINSIC_WO_CHAIN: 9451 return ReplaceLongIntrinsic(N, Results, DAG); 9452 case ISD::ABS: 9453 lowerABS(N, Results, DAG); 9454 return ; 9455 case ISD::LOAD: 9456 LowerLOAD(N, Results, DAG); 9457 break; 9458 } 9459 if (Res.getNode()) 9460 Results.push_back(Res); 9461 } 9462 9463 //===----------------------------------------------------------------------===// 9464 // ARM Scheduler Hooks 9465 //===----------------------------------------------------------------------===// 9466 9467 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 9468 /// registers the function context. 9469 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 9470 MachineBasicBlock *MBB, 9471 MachineBasicBlock *DispatchBB, 9472 int FI) const { 9473 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 9474 "ROPI/RWPI not currently supported with SjLj"); 9475 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9476 DebugLoc dl = MI.getDebugLoc(); 9477 MachineFunction *MF = MBB->getParent(); 9478 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9479 MachineConstantPool *MCP = MF->getConstantPool(); 9480 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 9481 const Function &F = MF->getFunction(); 9482 9483 bool isThumb = Subtarget->isThumb(); 9484 bool isThumb2 = Subtarget->isThumb2(); 9485 9486 unsigned PCLabelId = AFI->createPICLabelUId(); 9487 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 9488 ARMConstantPoolValue *CPV = 9489 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 9490 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 9491 9492 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 9493 : &ARM::GPRRegClass; 9494 9495 // Grab constant pool and fixed stack memory operands. 
9496 MachineMemOperand *CPMMO = 9497 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 9498 MachineMemOperand::MOLoad, 4, 4); 9499 9500 MachineMemOperand *FIMMOSt = 9501 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 9502 MachineMemOperand::MOStore, 4, 4); 9503 9504 // Load the address of the dispatch MBB into the jump buffer. 9505 if (isThumb2) { 9506 // Incoming value: jbuf 9507 // ldr.n r5, LCPI1_1 9508 // orr r5, r5, #1 9509 // add r5, pc 9510 // str r5, [$jbuf, #+4] ; &jbuf[1] 9511 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9512 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 9513 .addConstantPoolIndex(CPI) 9514 .addMemOperand(CPMMO) 9515 .add(predOps(ARMCC::AL)); 9516 // Set the low bit because of thumb mode. 9517 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9518 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 9519 .addReg(NewVReg1, RegState::Kill) 9520 .addImm(0x01) 9521 .add(predOps(ARMCC::AL)) 9522 .add(condCodeOp()); 9523 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9524 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 9525 .addReg(NewVReg2, RegState::Kill) 9526 .addImm(PCLabelId); 9527 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 9528 .addReg(NewVReg3, RegState::Kill) 9529 .addFrameIndex(FI) 9530 .addImm(36) // &jbuf[1] :: pc 9531 .addMemOperand(FIMMOSt) 9532 .add(predOps(ARMCC::AL)); 9533 } else if (isThumb) { 9534 // Incoming value: jbuf 9535 // ldr.n r1, LCPI1_4 9536 // add r1, pc 9537 // mov r2, #1 9538 // orrs r1, r2 9539 // add r2, $jbuf, #+4 ; &jbuf[1] 9540 // str r1, [r2] 9541 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9542 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 9543 .addConstantPoolIndex(CPI) 9544 .addMemOperand(CPMMO) 9545 .add(predOps(ARMCC::AL)); 9546 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9547 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 9548 .addReg(NewVReg1, RegState::Kill) 9549 .addImm(PCLabelId); 9550 // Set the low bit because of thumb mode. 
9551 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9552 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 9553 .addReg(ARM::CPSR, RegState::Define) 9554 .addImm(1) 9555 .add(predOps(ARMCC::AL)); 9556 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9557 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 9558 .addReg(ARM::CPSR, RegState::Define) 9559 .addReg(NewVReg2, RegState::Kill) 9560 .addReg(NewVReg3, RegState::Kill) 9561 .add(predOps(ARMCC::AL)); 9562 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9563 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 9564 .addFrameIndex(FI) 9565 .addImm(36); // &jbuf[1] :: pc 9566 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 9567 .addReg(NewVReg4, RegState::Kill) 9568 .addReg(NewVReg5, RegState::Kill) 9569 .addImm(0) 9570 .addMemOperand(FIMMOSt) 9571 .add(predOps(ARMCC::AL)); 9572 } else { 9573 // Incoming value: jbuf 9574 // ldr r1, LCPI1_1 9575 // add r1, pc, r1 9576 // str r1, [$jbuf, #+4] ; &jbuf[1] 9577 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9578 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 9579 .addConstantPoolIndex(CPI) 9580 .addImm(0) 9581 .addMemOperand(CPMMO) 9582 .add(predOps(ARMCC::AL)); 9583 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9584 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 9585 .addReg(NewVReg1, RegState::Kill) 9586 .addImm(PCLabelId) 9587 .add(predOps(ARMCC::AL)); 9588 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 9589 .addReg(NewVReg2, RegState::Kill) 9590 .addFrameIndex(FI) 9591 .addImm(36) // &jbuf[1] :: pc 9592 .addMemOperand(FIMMOSt) 9593 .add(predOps(ARMCC::AL)); 9594 } 9595 } 9596 9597 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 9598 MachineBasicBlock *MBB) const { 9599 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9600 DebugLoc dl = MI.getDebugLoc(); 9601 MachineFunction *MF = MBB->getParent(); 9602 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9603 MachineFrameInfo &MFI = MF->getFrameInfo(); 9604 int FI = MFI.getFunctionContextIndex(); 9605 9606 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 9607 : &ARM::GPRnopcRegClass; 9608 9609 // Get a mapping of the call site numbers to all of the landing pads they're 9610 // associated with. 9611 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 9612 unsigned MaxCSNum = 0; 9613 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 9614 ++BB) { 9615 if (!BB->isEHPad()) continue; 9616 9617 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 9618 // pad. 9619 for (MachineBasicBlock::iterator 9620 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 9621 if (!II->isEHLabel()) continue; 9622 9623 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 9624 if (!MF->hasCallSiteLandingPad(Sym)) continue; 9625 9626 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 9627 for (SmallVectorImpl<unsigned>::iterator 9628 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 9629 CSI != CSE; ++CSI) { 9630 CallSiteNumToLPad[*CSI].push_back(&*BB); 9631 MaxCSNum = std::max(MaxCSNum, *CSI); 9632 } 9633 break; 9634 } 9635 } 9636 9637 // Get an ordered list of the machine basic blocks for the jump table. 
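  // (Call-site numbers are 1-based, so the loop below walks 1..MaxCSNum and
  // appends every landing pad recorded for that call site; the predecessors
  // of each pad are also collected in InvokeBBs so their successor edges can
  // later be rewired to the dispatch block.)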
9638 std::vector<MachineBasicBlock*> LPadList; 9639 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 9640 LPadList.reserve(CallSiteNumToLPad.size()); 9641 for (unsigned I = 1; I <= MaxCSNum; ++I) { 9642 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 9643 for (SmallVectorImpl<MachineBasicBlock*>::iterator 9644 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 9645 LPadList.push_back(*II); 9646 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 9647 } 9648 } 9649 9650 assert(!LPadList.empty() && 9651 "No landing pad destinations for the dispatch jump table!"); 9652 9653 // Create the jump table and associated information. 9654 MachineJumpTableInfo *JTI = 9655 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 9656 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 9657 9658 // Create the MBBs for the dispatch code. 9659 9660 // Shove the dispatch's address into the return slot in the function context. 9661 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 9662 DispatchBB->setIsEHPad(); 9663 9664 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9665 unsigned trap_opcode; 9666 if (Subtarget->isThumb()) 9667 trap_opcode = ARM::tTRAP; 9668 else 9669 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 9670 9671 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 9672 DispatchBB->addSuccessor(TrapBB); 9673 9674 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 9675 DispatchBB->addSuccessor(DispContBB); 9676 9677 // Insert and MBBs. 9678 MF->insert(MF->end(), DispatchBB); 9679 MF->insert(MF->end(), DispContBB); 9680 MF->insert(MF->end(), TrapBB); 9681 9682 // Insert code into the entry block that creates and registers the function 9683 // context. 9684 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 9685 9686 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 9687 MachinePointerInfo::getFixedStack(*MF, FI), 9688 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 9689 9690 MachineInstrBuilder MIB; 9691 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 9692 9693 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 9694 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 9695 9696 // Add a register mask with no preserved registers. This results in all 9697 // registers being marked as clobbered. This can't work if the dispatch block 9698 // is in a Thumb1 function and is linked with ARM code which uses the FP 9699 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 
9700 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 9701 9702 bool IsPositionIndependent = isPositionIndependent(); 9703 unsigned NumLPads = LPadList.size(); 9704 if (Subtarget->isThumb2()) { 9705 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9706 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 9707 .addFrameIndex(FI) 9708 .addImm(4) 9709 .addMemOperand(FIMMOLd) 9710 .add(predOps(ARMCC::AL)); 9711 9712 if (NumLPads < 256) { 9713 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 9714 .addReg(NewVReg1) 9715 .addImm(LPadList.size()) 9716 .add(predOps(ARMCC::AL)); 9717 } else { 9718 Register VReg1 = MRI->createVirtualRegister(TRC); 9719 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 9720 .addImm(NumLPads & 0xFFFF) 9721 .add(predOps(ARMCC::AL)); 9722 9723 unsigned VReg2 = VReg1; 9724 if ((NumLPads & 0xFFFF0000) != 0) { 9725 VReg2 = MRI->createVirtualRegister(TRC); 9726 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 9727 .addReg(VReg1) 9728 .addImm(NumLPads >> 16) 9729 .add(predOps(ARMCC::AL)); 9730 } 9731 9732 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 9733 .addReg(NewVReg1) 9734 .addReg(VReg2) 9735 .add(predOps(ARMCC::AL)); 9736 } 9737 9738 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 9739 .addMBB(TrapBB) 9740 .addImm(ARMCC::HI) 9741 .addReg(ARM::CPSR); 9742 9743 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9744 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 9745 .addJumpTableIndex(MJTI) 9746 .add(predOps(ARMCC::AL)); 9747 9748 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9749 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 9750 .addReg(NewVReg3, RegState::Kill) 9751 .addReg(NewVReg1) 9752 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9753 .add(predOps(ARMCC::AL)) 9754 .add(condCodeOp()); 9755 9756 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 9757 .addReg(NewVReg4, RegState::Kill) 9758 .addReg(NewVReg1) 9759 .addJumpTableIndex(MJTI); 9760 } else if (Subtarget->isThumb()) { 9761 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9762 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 9763 .addFrameIndex(FI) 9764 .addImm(1) 9765 .addMemOperand(FIMMOLd) 9766 .add(predOps(ARMCC::AL)); 9767 9768 if (NumLPads < 256) { 9769 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 9770 .addReg(NewVReg1) 9771 .addImm(NumLPads) 9772 .add(predOps(ARMCC::AL)); 9773 } else { 9774 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9775 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9776 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9777 9778 // MachineConstantPool wants an explicit alignment. 
9779 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9780 if (Align == 0) 9781 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9782 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9783 9784 Register VReg1 = MRI->createVirtualRegister(TRC); 9785 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 9786 .addReg(VReg1, RegState::Define) 9787 .addConstantPoolIndex(Idx) 9788 .add(predOps(ARMCC::AL)); 9789 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 9790 .addReg(NewVReg1) 9791 .addReg(VReg1) 9792 .add(predOps(ARMCC::AL)); 9793 } 9794 9795 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 9796 .addMBB(TrapBB) 9797 .addImm(ARMCC::HI) 9798 .addReg(ARM::CPSR); 9799 9800 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9801 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 9802 .addReg(ARM::CPSR, RegState::Define) 9803 .addReg(NewVReg1) 9804 .addImm(2) 9805 .add(predOps(ARMCC::AL)); 9806 9807 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9808 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 9809 .addJumpTableIndex(MJTI) 9810 .add(predOps(ARMCC::AL)); 9811 9812 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9813 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 9814 .addReg(ARM::CPSR, RegState::Define) 9815 .addReg(NewVReg2, RegState::Kill) 9816 .addReg(NewVReg3) 9817 .add(predOps(ARMCC::AL)); 9818 9819 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9820 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9821 9822 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9823 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 9824 .addReg(NewVReg4, RegState::Kill) 9825 .addImm(0) 9826 .addMemOperand(JTMMOLd) 9827 .add(predOps(ARMCC::AL)); 9828 9829 unsigned NewVReg6 = NewVReg5; 9830 if (IsPositionIndependent) { 9831 NewVReg6 = MRI->createVirtualRegister(TRC); 9832 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 9833 .addReg(ARM::CPSR, RegState::Define) 9834 .addReg(NewVReg5, RegState::Kill) 9835 .addReg(NewVReg3) 9836 .add(predOps(ARMCC::AL)); 9837 } 9838 9839 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 9840 .addReg(NewVReg6, RegState::Kill) 9841 .addJumpTableIndex(MJTI); 9842 } else { 9843 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9844 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 9845 .addFrameIndex(FI) 9846 .addImm(4) 9847 .addMemOperand(FIMMOLd) 9848 .add(predOps(ARMCC::AL)); 9849 9850 if (NumLPads < 256) { 9851 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 9852 .addReg(NewVReg1) 9853 .addImm(NumLPads) 9854 .add(predOps(ARMCC::AL)); 9855 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 9856 Register VReg1 = MRI->createVirtualRegister(TRC); 9857 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 9858 .addImm(NumLPads & 0xFFFF) 9859 .add(predOps(ARMCC::AL)); 9860 9861 unsigned VReg2 = VReg1; 9862 if ((NumLPads & 0xFFFF0000) != 0) { 9863 VReg2 = MRI->createVirtualRegister(TRC); 9864 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 9865 .addReg(VReg1) 9866 .addImm(NumLPads >> 16) 9867 .add(predOps(ARMCC::AL)); 9868 } 9869 9870 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9871 .addReg(NewVReg1) 9872 .addReg(VReg2) 9873 .add(predOps(ARMCC::AL)); 9874 } else { 9875 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9876 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9877 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9878 9879 // MachineConstantPool wants an explicit alignment. 
9880 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9881 if (Align == 0) 9882 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9883 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9884 9885 Register VReg1 = MRI->createVirtualRegister(TRC); 9886 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 9887 .addReg(VReg1, RegState::Define) 9888 .addConstantPoolIndex(Idx) 9889 .addImm(0) 9890 .add(predOps(ARMCC::AL)); 9891 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9892 .addReg(NewVReg1) 9893 .addReg(VReg1, RegState::Kill) 9894 .add(predOps(ARMCC::AL)); 9895 } 9896 9897 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 9898 .addMBB(TrapBB) 9899 .addImm(ARMCC::HI) 9900 .addReg(ARM::CPSR); 9901 9902 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9903 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 9904 .addReg(NewVReg1) 9905 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9906 .add(predOps(ARMCC::AL)) 9907 .add(condCodeOp()); 9908 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9909 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 9910 .addJumpTableIndex(MJTI) 9911 .add(predOps(ARMCC::AL)); 9912 9913 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9914 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9915 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9916 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 9917 .addReg(NewVReg3, RegState::Kill) 9918 .addReg(NewVReg4) 9919 .addImm(0) 9920 .addMemOperand(JTMMOLd) 9921 .add(predOps(ARMCC::AL)); 9922 9923 if (IsPositionIndependent) { 9924 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 9925 .addReg(NewVReg5, RegState::Kill) 9926 .addReg(NewVReg4) 9927 .addJumpTableIndex(MJTI); 9928 } else { 9929 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 9930 .addReg(NewVReg5, RegState::Kill) 9931 .addJumpTableIndex(MJTI); 9932 } 9933 } 9934 9935 // Add the jump table entries as successors to the MBB. 9936 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 9937 for (std::vector<MachineBasicBlock*>::iterator 9938 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 9939 MachineBasicBlock *CurMBB = *I; 9940 if (SeenMBBs.insert(CurMBB).second) 9941 DispContBB->addSuccessor(CurMBB); 9942 } 9943 9944 // N.B. the order the invoke BBs are processed in doesn't matter here. 9945 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 9946 SmallVector<MachineBasicBlock*, 64> MBBLPads; 9947 for (MachineBasicBlock *BB : InvokeBBs) { 9948 9949 // Remove the landing pad successor from the invoke block and replace it 9950 // with the new dispatch block. 9951 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 9952 BB->succ_end()); 9953 while (!Successors.empty()) { 9954 MachineBasicBlock *SMBB = Successors.pop_back_val(); 9955 if (SMBB->isEHPad()) { 9956 BB->removeSuccessor(SMBB); 9957 MBBLPads.push_back(SMBB); 9958 } 9959 } 9960 9961 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 9962 BB->normalizeSuccProbs(); 9963 9964 // Find the invoke call and mark all of the callee-saved registers as 9965 // 'implicit defined' so that they're spilled. This prevents code from 9966 // moving instructions to before the EH block, where they will never be 9967 // executed. 
9968 for (MachineBasicBlock::reverse_iterator 9969 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 9970 if (!II->isCall()) continue; 9971 9972 DenseMap<unsigned, bool> DefRegs; 9973 for (MachineInstr::mop_iterator 9974 OI = II->operands_begin(), OE = II->operands_end(); 9975 OI != OE; ++OI) { 9976 if (!OI->isReg()) continue; 9977 DefRegs[OI->getReg()] = true; 9978 } 9979 9980 MachineInstrBuilder MIB(*MF, &*II); 9981 9982 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 9983 unsigned Reg = SavedRegs[i]; 9984 if (Subtarget->isThumb2() && 9985 !ARM::tGPRRegClass.contains(Reg) && 9986 !ARM::hGPRRegClass.contains(Reg)) 9987 continue; 9988 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 9989 continue; 9990 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 9991 continue; 9992 if (!DefRegs[Reg]) 9993 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 9994 } 9995 9996 break; 9997 } 9998 } 9999 10000 // Mark all former landing pads as non-landing pads. The dispatch is the only 10001 // landing pad now. 10002 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10003 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 10004 (*I)->setIsEHPad(false); 10005 10006 // The instruction is gone now. 10007 MI.eraseFromParent(); 10008 } 10009 10010 static 10011 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 10012 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 10013 E = MBB->succ_end(); I != E; ++I) 10014 if (*I != Succ) 10015 return *I; 10016 llvm_unreachable("Expecting a BB with two successors!"); 10017 } 10018 10019 /// Return the load opcode for a given load size. If load size >= 8, 10020 /// neon opcode will be returned. 10021 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 10022 if (LdSize >= 8) 10023 return LdSize == 16 ? ARM::VLD1q32wb_fixed 10024 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 10025 if (IsThumb1) 10026 return LdSize == 4 ? ARM::tLDRi 10027 : LdSize == 2 ? ARM::tLDRHi 10028 : LdSize == 1 ? ARM::tLDRBi : 0; 10029 if (IsThumb2) 10030 return LdSize == 4 ? ARM::t2LDR_POST 10031 : LdSize == 2 ? ARM::t2LDRH_POST 10032 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 10033 return LdSize == 4 ? ARM::LDR_POST_IMM 10034 : LdSize == 2 ? ARM::LDRH_POST 10035 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 10036 } 10037 10038 /// Return the store opcode for a given store size. If store size >= 8, 10039 /// neon opcode will be returned. 10040 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 10041 if (StSize >= 8) 10042 return StSize == 16 ? ARM::VST1q32wb_fixed 10043 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 10044 if (IsThumb1) 10045 return StSize == 4 ? ARM::tSTRi 10046 : StSize == 2 ? ARM::tSTRHi 10047 : StSize == 1 ? ARM::tSTRBi : 0; 10048 if (IsThumb2) 10049 return StSize == 4 ? ARM::t2STR_POST 10050 : StSize == 2 ? ARM::t2STRH_POST 10051 : StSize == 1 ? ARM::t2STRB_POST : 0; 10052 return StSize == 4 ? ARM::STR_POST_IMM 10053 : StSize == 2 ? ARM::STRH_POST 10054 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 10055 } 10056 10057 /// Emit a post-increment load operation with given size. The instructions 10058 /// will be added to BB at Pos. 
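/// For example, in ARM mode with LdSize == 4 this emits a post-indexed
/// "ldr Data, [AddrIn], #4", leaving the incremented address in AddrOut;
/// Thumb1 has no post-indexed loads, so a separate tADDi8 is emitted to
/// update the address instead.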
10059 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10060 const TargetInstrInfo *TII, const DebugLoc &dl, 10061 unsigned LdSize, unsigned Data, unsigned AddrIn, 10062 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10063 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10064 assert(LdOpc != 0 && "Should have a load opcode"); 10065 if (LdSize >= 8) { 10066 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10067 .addReg(AddrOut, RegState::Define) 10068 .addReg(AddrIn) 10069 .addImm(0) 10070 .add(predOps(ARMCC::AL)); 10071 } else if (IsThumb1) { 10072 // load + update AddrIn 10073 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10074 .addReg(AddrIn) 10075 .addImm(0) 10076 .add(predOps(ARMCC::AL)); 10077 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10078 .add(t1CondCodeOp()) 10079 .addReg(AddrIn) 10080 .addImm(LdSize) 10081 .add(predOps(ARMCC::AL)); 10082 } else if (IsThumb2) { 10083 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10084 .addReg(AddrOut, RegState::Define) 10085 .addReg(AddrIn) 10086 .addImm(LdSize) 10087 .add(predOps(ARMCC::AL)); 10088 } else { // arm 10089 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10090 .addReg(AddrOut, RegState::Define) 10091 .addReg(AddrIn) 10092 .addReg(0) 10093 .addImm(LdSize) 10094 .add(predOps(ARMCC::AL)); 10095 } 10096 } 10097 10098 /// Emit a post-increment store operation with given size. The instructions 10099 /// will be added to BB at Pos. 10100 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10101 const TargetInstrInfo *TII, const DebugLoc &dl, 10102 unsigned StSize, unsigned Data, unsigned AddrIn, 10103 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10104 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10105 assert(StOpc != 0 && "Should have a store opcode"); 10106 if (StSize >= 8) { 10107 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10108 .addReg(AddrIn) 10109 .addImm(0) 10110 .addReg(Data) 10111 .add(predOps(ARMCC::AL)); 10112 } else if (IsThumb1) { 10113 // store + update AddrIn 10114 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10115 .addReg(Data) 10116 .addReg(AddrIn) 10117 .addImm(0) 10118 .add(predOps(ARMCC::AL)); 10119 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10120 .add(t1CondCodeOp()) 10121 .addReg(AddrIn) 10122 .addImm(StSize) 10123 .add(predOps(ARMCC::AL)); 10124 } else if (IsThumb2) { 10125 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10126 .addReg(Data) 10127 .addReg(AddrIn) 10128 .addImm(StSize) 10129 .add(predOps(ARMCC::AL)); 10130 } else { // arm 10131 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10132 .addReg(Data) 10133 .addReg(AddrIn) 10134 .addReg(0) 10135 .addImm(StSize) 10136 .add(predOps(ARMCC::AL)); 10137 } 10138 } 10139 10140 MachineBasicBlock * 10141 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10142 MachineBasicBlock *BB) const { 10143 // This pseudo instruction has 3 operands: dst, src, size 10144 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10145 // Otherwise, we will generate unrolled scalar copies. 
10146 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10147 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10148 MachineFunction::iterator It = ++BB->getIterator(); 10149 10150 Register dest = MI.getOperand(0).getReg(); 10151 Register src = MI.getOperand(1).getReg(); 10152 unsigned SizeVal = MI.getOperand(2).getImm(); 10153 unsigned Align = MI.getOperand(3).getImm(); 10154 DebugLoc dl = MI.getDebugLoc(); 10155 10156 MachineFunction *MF = BB->getParent(); 10157 MachineRegisterInfo &MRI = MF->getRegInfo(); 10158 unsigned UnitSize = 0; 10159 const TargetRegisterClass *TRC = nullptr; 10160 const TargetRegisterClass *VecTRC = nullptr; 10161 10162 bool IsThumb1 = Subtarget->isThumb1Only(); 10163 bool IsThumb2 = Subtarget->isThumb2(); 10164 bool IsThumb = Subtarget->isThumb(); 10165 10166 if (Align & 1) { 10167 UnitSize = 1; 10168 } else if (Align & 2) { 10169 UnitSize = 2; 10170 } else { 10171 // Check whether we can use NEON instructions. 10172 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 10173 Subtarget->hasNEON()) { 10174 if ((Align % 16 == 0) && SizeVal >= 16) 10175 UnitSize = 16; 10176 else if ((Align % 8 == 0) && SizeVal >= 8) 10177 UnitSize = 8; 10178 } 10179 // Can't use NEON instructions. 10180 if (UnitSize == 0) 10181 UnitSize = 4; 10182 } 10183 10184 // Select the correct opcode and register class for unit size load/store 10185 bool IsNeon = UnitSize >= 8; 10186 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 10187 if (IsNeon) 10188 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 10189 : UnitSize == 8 ? &ARM::DPRRegClass 10190 : nullptr; 10191 10192 unsigned BytesLeft = SizeVal % UnitSize; 10193 unsigned LoopSize = SizeVal - BytesLeft; 10194 10195 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 10196 // Use LDR and STR to copy. 10197 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 10198 // [destOut] = STR_POST(scratch, destIn, UnitSize) 10199 unsigned srcIn = src; 10200 unsigned destIn = dest; 10201 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 10202 Register srcOut = MRI.createVirtualRegister(TRC); 10203 Register destOut = MRI.createVirtualRegister(TRC); 10204 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10205 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 10206 IsThumb1, IsThumb2); 10207 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 10208 IsThumb1, IsThumb2); 10209 srcIn = srcOut; 10210 destIn = destOut; 10211 } 10212 10213 // Handle the leftover bytes with LDRB and STRB. 10214 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 10215 // [destOut] = STRB_POST(scratch, destIn, 1) 10216 for (unsigned i = 0; i < BytesLeft; i++) { 10217 Register srcOut = MRI.createVirtualRegister(TRC); 10218 Register destOut = MRI.createVirtualRegister(TRC); 10219 Register scratch = MRI.createVirtualRegister(TRC); 10220 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 10221 IsThumb1, IsThumb2); 10222 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 10223 IsThumb1, IsThumb2); 10224 srcIn = srcOut; 10225 destIn = destOut; 10226 } 10227 MI.eraseFromParent(); // The instruction is gone now. 10228 return BB; 10229 } 10230 10231 // Expand the pseudo op to a loop. 10232 // thisMBB: 10233 // ... 
10234 // movw varEnd, # --> with thumb2 10235 // movt varEnd, # 10236 // ldrcp varEnd, idx --> without thumb2 10237 // fallthrough --> loopMBB 10238 // loopMBB: 10239 // PHI varPhi, varEnd, varLoop 10240 // PHI srcPhi, src, srcLoop 10241 // PHI destPhi, dst, destLoop 10242 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10243 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10244 // subs varLoop, varPhi, #UnitSize 10245 // bne loopMBB 10246 // fallthrough --> exitMBB 10247 // exitMBB: 10248 // epilogue to handle left-over bytes 10249 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10250 // [destOut] = STRB_POST(scratch, destLoop, 1) 10251 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10252 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10253 MF->insert(It, loopMBB); 10254 MF->insert(It, exitMBB); 10255 10256 // Transfer the remainder of BB and its successor edges to exitMBB. 10257 exitMBB->splice(exitMBB->begin(), BB, 10258 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10259 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10260 10261 // Load an immediate to varEnd. 10262 Register varEnd = MRI.createVirtualRegister(TRC); 10263 if (Subtarget->useMovt()) { 10264 unsigned Vtmp = varEnd; 10265 if ((LoopSize & 0xFFFF0000) != 0) 10266 Vtmp = MRI.createVirtualRegister(TRC); 10267 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 10268 .addImm(LoopSize & 0xFFFF) 10269 .add(predOps(ARMCC::AL)); 10270 10271 if ((LoopSize & 0xFFFF0000) != 0) 10272 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 10273 .addReg(Vtmp) 10274 .addImm(LoopSize >> 16) 10275 .add(predOps(ARMCC::AL)); 10276 } else { 10277 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10278 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10279 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 10280 10281 // MachineConstantPool wants an explicit alignment. 
10282 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 10283 if (Align == 0) 10284 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 10285 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 10286 MachineMemOperand *CPMMO = 10287 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10288 MachineMemOperand::MOLoad, 4, 4); 10289 10290 if (IsThumb) 10291 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 10292 .addReg(varEnd, RegState::Define) 10293 .addConstantPoolIndex(Idx) 10294 .add(predOps(ARMCC::AL)) 10295 .addMemOperand(CPMMO); 10296 else 10297 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 10298 .addReg(varEnd, RegState::Define) 10299 .addConstantPoolIndex(Idx) 10300 .addImm(0) 10301 .add(predOps(ARMCC::AL)) 10302 .addMemOperand(CPMMO); 10303 } 10304 BB->addSuccessor(loopMBB); 10305 10306 // Generate the loop body: 10307 // varPhi = PHI(varLoop, varEnd) 10308 // srcPhi = PHI(srcLoop, src) 10309 // destPhi = PHI(destLoop, dst) 10310 MachineBasicBlock *entryBB = BB; 10311 BB = loopMBB; 10312 Register varLoop = MRI.createVirtualRegister(TRC); 10313 Register varPhi = MRI.createVirtualRegister(TRC); 10314 Register srcLoop = MRI.createVirtualRegister(TRC); 10315 Register srcPhi = MRI.createVirtualRegister(TRC); 10316 Register destLoop = MRI.createVirtualRegister(TRC); 10317 Register destPhi = MRI.createVirtualRegister(TRC); 10318 10319 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 10320 .addReg(varLoop).addMBB(loopMBB) 10321 .addReg(varEnd).addMBB(entryBB); 10322 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 10323 .addReg(srcLoop).addMBB(loopMBB) 10324 .addReg(src).addMBB(entryBB); 10325 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 10326 .addReg(destLoop).addMBB(loopMBB) 10327 .addReg(dest).addMBB(entryBB); 10328 10329 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10330 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10331 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10332 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 10333 IsThumb1, IsThumb2); 10334 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 10335 IsThumb1, IsThumb2); 10336 10337 // Decrement loop variable by UnitSize. 10338 if (IsThumb1) { 10339 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 10340 .add(t1CondCodeOp()) 10341 .addReg(varPhi) 10342 .addImm(UnitSize) 10343 .add(predOps(ARMCC::AL)); 10344 } else { 10345 MachineInstrBuilder MIB = 10346 BuildMI(*BB, BB->end(), dl, 10347 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 10348 MIB.addReg(varPhi) 10349 .addImm(UnitSize) 10350 .add(predOps(ARMCC::AL)) 10351 .add(condCodeOp()); 10352 MIB->getOperand(5).setReg(ARM::CPSR); 10353 MIB->getOperand(5).setIsDef(true); 10354 } 10355 BuildMI(*BB, BB->end(), dl, 10356 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10357 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 10358 10359 // loopMBB can loop back to loopMBB or fall through to exitMBB. 10360 BB->addSuccessor(loopMBB); 10361 BB->addSuccessor(exitMBB); 10362 10363 // Add epilogue to handle BytesLeft.
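// For example (hypothetical values, not taken from the code above): copying
// SizeVal == 10 bytes with UnitSize == 4 gives LoopSize == 8 and
// BytesLeft == 2, so the loop above executes twice and the epilogue below
// emits two single-byte LDRB_POST/STRB_POST copies.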
10364 BB = exitMBB; 10365 auto StartOfExit = exitMBB->begin(); 10366 10367 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10368 // [destOut] = STRB_POST(scratch, destLoop, 1) 10369 unsigned srcIn = srcLoop; 10370 unsigned destIn = destLoop; 10371 for (unsigned i = 0; i < BytesLeft; i++) { 10372 Register srcOut = MRI.createVirtualRegister(TRC); 10373 Register destOut = MRI.createVirtualRegister(TRC); 10374 Register scratch = MRI.createVirtualRegister(TRC); 10375 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, 10376 IsThumb1, IsThumb2); 10377 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, 10378 IsThumb1, IsThumb2); 10379 srcIn = srcOut; 10380 destIn = destOut; 10381 } 10382 10383 MI.eraseFromParent(); // The instruction is gone now. 10384 return BB; 10385 } 10386 10387 MachineBasicBlock * 10388 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, 10389 MachineBasicBlock *MBB) const { 10390 const TargetMachine &TM = getTargetMachine(); 10391 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 10392 DebugLoc DL = MI.getDebugLoc(); 10393 10394 assert(Subtarget->isTargetWindows() && 10395 "__chkstk is only supported on Windows"); 10396 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); 10397 10398 // __chkstk takes the number of words to allocate on the stack in R4, and 10399 // returns the stack adjustment in number of bytes in R4. This will not 10400 // clobber any other registers (other than the obvious lr). 10401 // 10402 // Although, technically, IP should be considered a register which may be 10403 // clobbered, the call itself will not touch it. Windows on ARM is a pure 10404 // thumb-2 environment, so there is no interworking required. As a result, we 10405 // do not expect a veneer to be emitted by the linker, clobbering IP. 10406 // 10407 // Each module receives its own copy of __chkstk, so no import thunk is 10408 // required, again, ensuring that IP is not clobbered. 10409 // 10410 // Finally, although some linkers may theoretically provide a trampoline for 10411 // out of range calls (which is quite common due to a 32M range limitation of 10412 // branches for Thumb), we can generate the long-call version via 10413 // -mcmodel=large, alleviating the need for the trampoline which may clobber 10414 // IP.
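// Roughly, the code below emits (shown for CodeModel::Large; the smaller
// code models use a direct "bl __chkstk" instead):
//   movw rN, :lower16:__chkstk
//   movt rN, :upper16:__chkstk
//   blx  rN
//   sub  sp, sp, r4
// where rN stands for a fresh rGPR virtual register; the concrete register
// is chosen later by the register allocator.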
10415 10416 switch (TM.getCodeModel()) { 10417 case CodeModel::Tiny: 10418 llvm_unreachable("Tiny code model not available on ARM."); 10419 case CodeModel::Small: 10420 case CodeModel::Medium: 10421 case CodeModel::Kernel: 10422 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 10423 .add(predOps(ARMCC::AL)) 10424 .addExternalSymbol("__chkstk") 10425 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10426 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10427 .addReg(ARM::R12, 10428 RegState::Implicit | RegState::Define | RegState::Dead) 10429 .addReg(ARM::CPSR, 10430 RegState::Implicit | RegState::Define | RegState::Dead); 10431 break; 10432 case CodeModel::Large: { 10433 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10434 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 10435 10436 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 10437 .addExternalSymbol("__chkstk"); 10438 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 10439 .add(predOps(ARMCC::AL)) 10440 .addReg(Reg, RegState::Kill) 10441 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10442 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10443 .addReg(ARM::R12, 10444 RegState::Implicit | RegState::Define | RegState::Dead) 10445 .addReg(ARM::CPSR, 10446 RegState::Implicit | RegState::Define | RegState::Dead); 10447 break; 10448 } 10449 } 10450 10451 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 10452 .addReg(ARM::SP, RegState::Kill) 10453 .addReg(ARM::R4, RegState::Kill) 10454 .setMIFlags(MachineInstr::FrameSetup) 10455 .add(predOps(ARMCC::AL)) 10456 .add(condCodeOp()); 10457 10458 MI.eraseFromParent(); 10459 return MBB; 10460 } 10461 10462 MachineBasicBlock * 10463 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 10464 MachineBasicBlock *MBB) const { 10465 DebugLoc DL = MI.getDebugLoc(); 10466 MachineFunction *MF = MBB->getParent(); 10467 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10468 10469 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 10470 MF->insert(++MBB->getIterator(), ContBB); 10471 ContBB->splice(ContBB->begin(), MBB, 10472 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 10473 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 10474 MBB->addSuccessor(ContBB); 10475 10476 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10477 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 10478 MF->push_back(TrapBB); 10479 MBB->addSuccessor(TrapBB); 10480 10481 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 10482 .addReg(MI.getOperand(0).getReg()) 10483 .addImm(0) 10484 .add(predOps(ARMCC::AL)); 10485 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 10486 .addMBB(TrapBB) 10487 .addImm(ARMCC::EQ) 10488 .addReg(ARM::CPSR); 10489 10490 MI.eraseFromParent(); 10491 return ContBB; 10492 } 10493 10494 // The CPSR operand of SelectItr might be missing a kill marker 10495 // because there were multiple uses of CPSR, and ISel didn't know 10496 // which to mark. Figure out whether SelectItr should have had a 10497 // kill marker, and set it if it should. Returns the correct kill 10498 // marker value. 10499 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 10500 MachineBasicBlock* BB, 10501 const TargetRegisterInfo* TRI) { 10502 // Scan forward through BB for a use/def of CPSR. 
10503 MachineBasicBlock::iterator miI(std::next(SelectItr)); 10504 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 10505 const MachineInstr& mi = *miI; 10506 if (mi.readsRegister(ARM::CPSR)) 10507 return false; 10508 if (mi.definesRegister(ARM::CPSR)) 10509 break; // Should have kill-flag - update below. 10510 } 10511 10512 // If we hit the end of the block, check whether CPSR is live into a 10513 // successor. 10514 if (miI == BB->end()) { 10515 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 10516 sEnd = BB->succ_end(); 10517 sItr != sEnd; ++sItr) { 10518 MachineBasicBlock* succ = *sItr; 10519 if (succ->isLiveIn(ARM::CPSR)) 10520 return false; 10521 } 10522 } 10523 10524 // We found a def, or hit the end of the basic block and CPSR wasn't live 10525 // out. SelectMI should have a kill flag on CPSR. 10526 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 10527 return true; 10528 } 10529 10530 MachineBasicBlock * 10531 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10532 MachineBasicBlock *BB) const { 10533 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10534 DebugLoc dl = MI.getDebugLoc(); 10535 bool isThumb2 = Subtarget->isThumb2(); 10536 switch (MI.getOpcode()) { 10537 default: { 10538 MI.print(errs()); 10539 llvm_unreachable("Unexpected instr type to insert"); 10540 } 10541 10542 // Thumb1 post-indexed loads are really just single-register LDMs. 10543 case ARM::tLDR_postidx: { 10544 MachineOperand Def(MI.getOperand(1)); 10545 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 10546 .add(Def) // Rn_wb 10547 .add(MI.getOperand(2)) // Rn 10548 .add(MI.getOperand(3)) // PredImm 10549 .add(MI.getOperand(4)) // PredReg 10550 .add(MI.getOperand(0)) // Rt 10551 .cloneMemRefs(MI); 10552 MI.eraseFromParent(); 10553 return BB; 10554 } 10555 10556 // The Thumb2 pre-indexed stores have the same MI operands, they just 10557 // define them differently in the .td files from the isel patterns, so 10558 // they need pseudos. 10559 case ARM::t2STR_preidx: 10560 MI.setDesc(TII->get(ARM::t2STR_PRE)); 10561 return BB; 10562 case ARM::t2STRB_preidx: 10563 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 10564 return BB; 10565 case ARM::t2STRH_preidx: 10566 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 10567 return BB; 10568 10569 case ARM::STRi_preidx: 10570 case ARM::STRBi_preidx: { 10571 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 10572 : ARM::STRB_PRE_IMM; 10573 // Decode the offset. 
10574 unsigned Offset = MI.getOperand(4).getImm(); 10575 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10576 Offset = ARM_AM::getAM2Offset(Offset); 10577 if (isSub) 10578 Offset = -Offset; 10579 10580 MachineMemOperand *MMO = *MI.memoperands_begin(); 10581 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10582 .add(MI.getOperand(0)) // Rn_wb 10583 .add(MI.getOperand(1)) // Rt 10584 .add(MI.getOperand(2)) // Rn 10585 .addImm(Offset) // offset (skip GPR==zero_reg) 10586 .add(MI.getOperand(5)) // pred 10587 .add(MI.getOperand(6)) 10588 .addMemOperand(MMO); 10589 MI.eraseFromParent(); 10590 return BB; 10591 } 10592 case ARM::STRr_preidx: 10593 case ARM::STRBr_preidx: 10594 case ARM::STRH_preidx: { 10595 unsigned NewOpc; 10596 switch (MI.getOpcode()) { 10597 default: llvm_unreachable("unexpected opcode!"); 10598 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10599 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10600 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10601 } 10602 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10603 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10604 MIB.add(MI.getOperand(i)); 10605 MI.eraseFromParent(); 10606 return BB; 10607 } 10608 10609 case ARM::tMOVCCr_pseudo: { 10610 // To "insert" a SELECT_CC instruction, we actually have to insert the 10611 // diamond control-flow pattern. The incoming instruction knows the 10612 // destination vreg to set, the condition code register to branch on, the 10613 // true/false values to select between, and a branch opcode to use. 10614 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10615 MachineFunction::iterator It = ++BB->getIterator(); 10616 10617 // thisMBB: 10618 // ... 10619 // TrueVal = ... 10620 // cmpTY ccX, r1, r2 10621 // bCC copy1MBB 10622 // fallthrough --> copy0MBB 10623 MachineBasicBlock *thisMBB = BB; 10624 MachineFunction *F = BB->getParent(); 10625 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10626 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10627 F->insert(It, copy0MBB); 10628 F->insert(It, sinkMBB); 10629 10630 // Check whether CPSR is live past the tMOVCCr_pseudo. 10631 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10632 if (!MI.killsRegister(ARM::CPSR) && 10633 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10634 copy0MBB->addLiveIn(ARM::CPSR); 10635 sinkMBB->addLiveIn(ARM::CPSR); 10636 } 10637 10638 // Transfer the remainder of BB and its successor edges to sinkMBB. 10639 sinkMBB->splice(sinkMBB->begin(), BB, 10640 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10641 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10642 10643 BB->addSuccessor(copy0MBB); 10644 BB->addSuccessor(sinkMBB); 10645 10646 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10647 .addMBB(sinkMBB) 10648 .addImm(MI.getOperand(3).getImm()) 10649 .addReg(MI.getOperand(4).getReg()); 10650 10651 // copy0MBB: 10652 // %FalseValue = ... 10653 // # fallthrough to sinkMBB 10654 BB = copy0MBB; 10655 10656 // Update machine-CFG edges 10657 BB->addSuccessor(sinkMBB); 10658 10659 // sinkMBB: 10660 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10661 // ... 10662 BB = sinkMBB; 10663 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10664 .addReg(MI.getOperand(1).getReg()) 10665 .addMBB(copy0MBB) 10666 .addReg(MI.getOperand(2).getReg()) 10667 .addMBB(thisMBB); 10668 10669 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10670 return BB; 10671 } 10672 10673 case ARM::BCCi64: 10674 case ARM::BCCZi64: { 10675 // If there is an unconditional branch to the other successor, remove it. 10676 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10677 10678 // Compare both parts that make up the double comparison separately for 10679 // equality. 10680 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10681 10682 Register LHS1 = MI.getOperand(1).getReg(); 10683 Register LHS2 = MI.getOperand(2).getReg(); 10684 if (RHSisZero) { 10685 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10686 .addReg(LHS1) 10687 .addImm(0) 10688 .add(predOps(ARMCC::AL)); 10689 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10690 .addReg(LHS2).addImm(0) 10691 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10692 } else { 10693 Register RHS1 = MI.getOperand(3).getReg(); 10694 Register RHS2 = MI.getOperand(4).getReg(); 10695 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10696 .addReg(LHS1) 10697 .addReg(RHS1) 10698 .add(predOps(ARMCC::AL)); 10699 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10700 .addReg(LHS2).addReg(RHS2) 10701 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10702 } 10703 10704 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10705 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10706 if (MI.getOperand(0).getImm() == ARMCC::NE) 10707 std::swap(destMBB, exitMBB); 10708 10709 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10710 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10711 if (isThumb2) 10712 BuildMI(BB, dl, TII->get(ARM::t2B)) 10713 .addMBB(exitMBB) 10714 .add(predOps(ARMCC::AL)); 10715 else 10716 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10717 10718 MI.eraseFromParent(); // The pseudo instruction is gone now. 10719 return BB; 10720 } 10721 10722 case ARM::Int_eh_sjlj_setjmp: 10723 case ARM::Int_eh_sjlj_setjmp_nofp: 10724 case ARM::tInt_eh_sjlj_setjmp: 10725 case ARM::t2Int_eh_sjlj_setjmp: 10726 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10727 return BB; 10728 10729 case ARM::Int_eh_sjlj_setup_dispatch: 10730 EmitSjLjDispatchBlock(MI, BB); 10731 return BB; 10732 10733 case ARM::ABS: 10734 case ARM::t2ABS: { 10735 // To insert an ABS instruction, we have to insert the 10736 // diamond control-flow pattern. The incoming instruction knows the 10737 // source vreg to test against 0, the destination vreg to set, 10738 // the condition code register to branch on, the 10739 // true/false values to select between, and a branch opcode to use. 
10740 // It transforms 10741 // V1 = ABS V0 10742 // into 10743 // V2 = MOVS V0 10744 // BCC (branch to SinkBB if V0 >= 0) 10745 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 10746 // SinkBB: V1 = PHI(V2, V3) 10747 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10748 MachineFunction::iterator BBI = ++BB->getIterator(); 10749 MachineFunction *Fn = BB->getParent(); 10750 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10751 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10752 Fn->insert(BBI, RSBBB); 10753 Fn->insert(BBI, SinkBB); 10754 10755 Register ABSSrcReg = MI.getOperand(1).getReg(); 10756 Register ABSDstReg = MI.getOperand(0).getReg(); 10757 bool ABSSrcKIll = MI.getOperand(1).isKill(); 10758 bool isThumb2 = Subtarget->isThumb2(); 10759 MachineRegisterInfo &MRI = Fn->getRegInfo(); 10760 // In Thumb mode S must not be specified if source register is the SP or 10761 // PC and if destination register is the SP, so restrict register class 10762 Register NewRsbDstReg = MRI.createVirtualRegister( 10763 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 10764 10765 // Transfer the remainder of BB and its successor edges to sinkMBB. 10766 SinkBB->splice(SinkBB->begin(), BB, 10767 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10768 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 10769 10770 BB->addSuccessor(RSBBB); 10771 BB->addSuccessor(SinkBB); 10772 10773 // fall through to SinkMBB 10774 RSBBB->addSuccessor(SinkBB); 10775 10776 // insert a cmp at the end of BB 10777 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10778 .addReg(ABSSrcReg) 10779 .addImm(0) 10780 .add(predOps(ARMCC::AL)); 10781 10782 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 10783 BuildMI(BB, dl, 10784 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 10785 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 10786 10787 // insert rsbri in RSBBB 10788 // Note: BCC and rsbri will be converted into predicated rsbmi 10789 // by if-conversion pass 10790 BuildMI(*RSBBB, RSBBB->begin(), dl, 10791 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 10792 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 10793 .addImm(0) 10794 .add(predOps(ARMCC::AL)) 10795 .add(condCodeOp()); 10796 10797 // insert PHI in SinkBB, 10798 // reuse ABSDstReg to not change uses of ABS instruction 10799 BuildMI(*SinkBB, SinkBB->begin(), dl, 10800 TII->get(ARM::PHI), ABSDstReg) 10801 .addReg(NewRsbDstReg).addMBB(RSBBB) 10802 .addReg(ABSSrcReg).addMBB(BB); 10803 10804 // remove ABS instruction 10805 MI.eraseFromParent(); 10806 10807 // return last added BB 10808 return SinkBB; 10809 } 10810 case ARM::COPY_STRUCT_BYVAL_I32: 10811 ++NumLoopByVals; 10812 return EmitStructByval(MI, BB); 10813 case ARM::WIN__CHKSTK: 10814 return EmitLowered__chkstk(MI, BB); 10815 case ARM::WIN__DBZCHK: 10816 return EmitLowered__dbzchk(MI, BB); 10817 } 10818 } 10819 10820 /// Attaches vregs to MEMCPY that it will use as scratch registers 10821 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 10822 /// instead of as a custom inserter because we need the use list from the SDNode. 
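/// For instance (illustrative count only): a MEMCPY whose scratch-register
/// count operand (operand 4 below) is 4 gets four fresh GPR (tGPR on Thumb1)
/// virtual registers appended as dead defs, which the later expansion into
/// LDM/STM can use as temporaries.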
10823 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 10824 MachineInstr &MI, const SDNode *Node) { 10825 bool isThumb1 = Subtarget->isThumb1Only(); 10826 10827 DebugLoc DL = MI.getDebugLoc(); 10828 MachineFunction *MF = MI.getParent()->getParent(); 10829 MachineRegisterInfo &MRI = MF->getRegInfo(); 10830 MachineInstrBuilder MIB(*MF, MI); 10831 10832 // If the new dst/src is unused mark it as dead. 10833 if (!Node->hasAnyUseOfValue(0)) { 10834 MI.getOperand(0).setIsDead(true); 10835 } 10836 if (!Node->hasAnyUseOfValue(1)) { 10837 MI.getOperand(1).setIsDead(true); 10838 } 10839 10840 // The MEMCPY both defines and kills the scratch registers. 10841 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 10842 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 10843 : &ARM::GPRRegClass); 10844 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 10845 } 10846 } 10847 10848 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 10849 SDNode *Node) const { 10850 if (MI.getOpcode() == ARM::MEMCPY) { 10851 attachMEMCPYScratchRegs(Subtarget, MI, Node); 10852 return; 10853 } 10854 10855 const MCInstrDesc *MCID = &MI.getDesc(); 10856 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 10857 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 10858 // operand is still set to noreg. If needed, set the optional operand's 10859 // register to CPSR, and remove the redundant implicit def. 10860 // 10861 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 10862 10863 // Rename pseudo opcodes. 10864 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 10865 unsigned ccOutIdx; 10866 if (NewOpc) { 10867 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 10868 MCID = &TII->get(NewOpc); 10869 10870 assert(MCID->getNumOperands() == 10871 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 10872 && "converted opcode should be the same except for cc_out" 10873 " (and, on Thumb1, pred)"); 10874 10875 MI.setDesc(*MCID); 10876 10877 // Add the optional cc_out operand 10878 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 10879 10880 // On Thumb1, move all input operands to the end, then add the predicate 10881 if (Subtarget->isThumb1Only()) { 10882 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 10883 MI.addOperand(MI.getOperand(1)); 10884 MI.RemoveOperand(1); 10885 } 10886 10887 // Restore the ties 10888 for (unsigned i = MI.getNumOperands(); i--;) { 10889 const MachineOperand& op = MI.getOperand(i); 10890 if (op.isReg() && op.isUse()) { 10891 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 10892 if (DefIdx != -1) 10893 MI.tieOperands(DefIdx, i); 10894 } 10895 } 10896 10897 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 10898 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 10899 ccOutIdx = 1; 10900 } else 10901 ccOutIdx = MCID->getNumOperands() - 1; 10902 } else 10903 ccOutIdx = MCID->getNumOperands() - 1; 10904 10905 // Any ARM instruction that sets the 's' bit should specify an optional 10906 // "cc_out" operand in the last operand position. 10907 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 10908 assert(!NewOpc && "Optional cc_out operand required"); 10909 return; 10910 } 10911 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 10912 // since we already have an optional CPSR def. 
10913 bool definesCPSR = false; 10914 bool deadCPSR = false; 10915 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 10916 ++i) { 10917 const MachineOperand &MO = MI.getOperand(i); 10918 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 10919 definesCPSR = true; 10920 if (MO.isDead()) 10921 deadCPSR = true; 10922 MI.RemoveOperand(i); 10923 break; 10924 } 10925 } 10926 if (!definesCPSR) { 10927 assert(!NewOpc && "Optional cc_out operand required"); 10928 return; 10929 } 10930 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 10931 if (deadCPSR) { 10932 assert(!MI.getOperand(ccOutIdx).getReg() && 10933 "expect uninitialized optional cc_out operand"); 10934 // Thumb1 instructions must have the S bit even if the CPSR is dead. 10935 if (!Subtarget->isThumb1Only()) 10936 return; 10937 } 10938 10939 // If this instruction was defined with an optional CPSR def and its dag node 10940 // had a live implicit CPSR def, then activate the optional CPSR def. 10941 MachineOperand &MO = MI.getOperand(ccOutIdx); 10942 MO.setReg(ARM::CPSR); 10943 MO.setIsDef(true); 10944 } 10945 10946 //===----------------------------------------------------------------------===// 10947 // ARM Optimization Hooks 10948 //===----------------------------------------------------------------------===// 10949 10950 // Helper function that checks if N is a null or all ones constant. 10951 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 10952 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 10953 } 10954 10955 // Return true if N is conditionally 0 or all ones. 10956 // Detects these expressions where cc is an i1 value: 10957 // 10958 // (select cc 0, y) [AllOnes=0] 10959 // (select cc y, 0) [AllOnes=0] 10960 // (zext cc) [AllOnes=0] 10961 // (sext cc) [AllOnes=0/1] 10962 // (select cc -1, y) [AllOnes=1] 10963 // (select cc y, -1) [AllOnes=1] 10964 // 10965 // Invert is set when N is the null/all ones constant when CC is false. 10966 // OtherOp is set to the alternative value of N. 10967 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 10968 SDValue &CC, bool &Invert, 10969 SDValue &OtherOp, 10970 SelectionDAG &DAG) { 10971 switch (N->getOpcode()) { 10972 default: return false; 10973 case ISD::SELECT: { 10974 CC = N->getOperand(0); 10975 SDValue N1 = N->getOperand(1); 10976 SDValue N2 = N->getOperand(2); 10977 if (isZeroOrAllOnes(N1, AllOnes)) { 10978 Invert = false; 10979 OtherOp = N2; 10980 return true; 10981 } 10982 if (isZeroOrAllOnes(N2, AllOnes)) { 10983 Invert = true; 10984 OtherOp = N1; 10985 return true; 10986 } 10987 return false; 10988 } 10989 case ISD::ZERO_EXTEND: 10990 // (zext cc) can never be the all ones value. 10991 if (AllOnes) 10992 return false; 10993 LLVM_FALLTHROUGH; 10994 case ISD::SIGN_EXTEND: { 10995 SDLoc dl(N); 10996 EVT VT = N->getValueType(0); 10997 CC = N->getOperand(0); 10998 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 10999 return false; 11000 Invert = !AllOnes; 11001 if (AllOnes) 11002 // When looking for an AllOnes constant, N is an sext, and the 'other' 11003 // value is 0. 11004 OtherOp = DAG.getConstant(0, dl, VT); 11005 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11006 // When looking for a 0 constant, N can be zext or sext. 
11007 OtherOp = DAG.getConstant(1, dl, VT); 11008 else 11009 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 11010 VT); 11011 return true; 11012 } 11013 } 11014 } 11015 11016 // Combine a constant select operand into its use: 11017 // 11018 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11019 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11020 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 11021 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 11022 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 11023 // 11024 // The transform is rejected if the select doesn't have a constant operand that 11025 // is null, or all ones when AllOnes is set. 11026 // 11027 // Also recognize sext/zext from i1: 11028 // 11029 // (add (zext cc), x) -> (select cc (add x, 1), x) 11030 // (add (sext cc), x) -> (select cc (add x, -1), x) 11031 // 11032 // These transformations eventually create predicated instructions. 11033 // 11034 // @param N The node to transform. 11035 // @param Slct The N operand that is a select. 11036 // @param OtherOp The other N operand (x above). 11037 // @param DCI Context. 11038 // @param AllOnes Require the select constant to be all ones instead of null. 11039 // @returns The new node, or SDValue() on failure. 11040 static 11041 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 11042 TargetLowering::DAGCombinerInfo &DCI, 11043 bool AllOnes = false) { 11044 SelectionDAG &DAG = DCI.DAG; 11045 EVT VT = N->getValueType(0); 11046 SDValue NonConstantVal; 11047 SDValue CCOp; 11048 bool SwapSelectOps; 11049 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 11050 NonConstantVal, DAG)) 11051 return SDValue(); 11052 11053 // Slct is now know to be the desired identity constant when CC is true. 11054 SDValue TrueVal = OtherOp; 11055 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 11056 OtherOp, NonConstantVal); 11057 // Unless SwapSelectOps says CC should be false. 11058 if (SwapSelectOps) 11059 std::swap(TrueVal, FalseVal); 11060 11061 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 11062 CCOp, TrueVal, FalseVal); 11063 } 11064 11065 // Attempt combineSelectAndUse on each operand of a commutative operator N. 11066 static 11067 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 11068 TargetLowering::DAGCombinerInfo &DCI) { 11069 SDValue N0 = N->getOperand(0); 11070 SDValue N1 = N->getOperand(1); 11071 if (N0.getNode()->hasOneUse()) 11072 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 11073 return Result; 11074 if (N1.getNode()->hasOneUse()) 11075 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 11076 return Result; 11077 return SDValue(); 11078 } 11079 11080 static bool IsVUZPShuffleNode(SDNode *N) { 11081 // VUZP shuffle node. 11082 if (N->getOpcode() == ARMISD::VUZP) 11083 return true; 11084 11085 // "VUZP" on i32 is an alias for VTRN. 11086 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 11087 return true; 11088 11089 return false; 11090 } 11091 11092 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 11093 TargetLowering::DAGCombinerInfo &DCI, 11094 const ARMSubtarget *Subtarget) { 11095 // Look for ADD(VUZP.0, VUZP.1). 11096 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 11097 N0 == N1) 11098 return SDValue(); 11099 11100 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 
11101 if (!N->getValueType(0).is64BitVector()) 11102 return SDValue(); 11103 11104 // Generate vpadd. 11105 SelectionDAG &DAG = DCI.DAG; 11106 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11107 SDLoc dl(N); 11108 SDNode *Unzip = N0.getNode(); 11109 EVT VT = N->getValueType(0); 11110 11111 SmallVector<SDValue, 8> Ops; 11112 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 11113 TLI.getPointerTy(DAG.getDataLayout()))); 11114 Ops.push_back(Unzip->getOperand(0)); 11115 Ops.push_back(Unzip->getOperand(1)); 11116 11117 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11118 } 11119 11120 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11121 TargetLowering::DAGCombinerInfo &DCI, 11122 const ARMSubtarget *Subtarget) { 11123 // Check for two extended operands. 11124 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 11125 N1.getOpcode() == ISD::SIGN_EXTEND) && 11126 !(N0.getOpcode() == ISD::ZERO_EXTEND && 11127 N1.getOpcode() == ISD::ZERO_EXTEND)) 11128 return SDValue(); 11129 11130 SDValue N00 = N0.getOperand(0); 11131 SDValue N10 = N1.getOperand(0); 11132 11133 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 11134 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 11135 N00 == N10) 11136 return SDValue(); 11137 11138 // We only recognize Q register paddl here; this can't be reached until 11139 // after type legalization. 11140 if (!N00.getValueType().is64BitVector() || 11141 !N0.getValueType().is128BitVector()) 11142 return SDValue(); 11143 11144 // Generate vpaddl. 11145 SelectionDAG &DAG = DCI.DAG; 11146 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11147 SDLoc dl(N); 11148 EVT VT = N->getValueType(0); 11149 11150 SmallVector<SDValue, 8> Ops; 11151 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 11152 unsigned Opcode; 11153 if (N0.getOpcode() == ISD::SIGN_EXTEND) 11154 Opcode = Intrinsic::arm_neon_vpaddls; 11155 else 11156 Opcode = Intrinsic::arm_neon_vpaddlu; 11157 Ops.push_back(DAG.getConstant(Opcode, dl, 11158 TLI.getPointerTy(DAG.getDataLayout()))); 11159 EVT ElemTy = N00.getValueType().getVectorElementType(); 11160 unsigned NumElts = VT.getVectorNumElements(); 11161 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 11162 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 11163 N00.getOperand(0), N00.getOperand(1)); 11164 Ops.push_back(Concat); 11165 11166 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11167 } 11168 11169 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 11170 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 11171 // much easier to match. 11172 static SDValue 11173 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11174 TargetLowering::DAGCombinerInfo &DCI, 11175 const ARMSubtarget *Subtarget) { 11176 // Only perform optimization if after legalize, and if NEON is available. We 11177 // also expected both operands to be BUILD_VECTORs. 11178 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 11179 || N0.getOpcode() != ISD::BUILD_VECTOR 11180 || N1.getOpcode() != ISD::BUILD_VECTOR) 11181 return SDValue(); 11182 11183 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 11184 EVT VT = N->getValueType(0); 11185 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 11186 return SDValue(); 11187 11188 // Check that the vector operands are of the right form. 
11189 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 11190 // operands, where N is the size of the formed vector. 11191 // Each EXTRACT_VECTOR should have the same input vector and odd or even 11192 // index such that we have a pairwise add pattern. 11193 11194 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. 11195 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11196 return SDValue(); 11197 SDValue Vec = N0->getOperand(0)->getOperand(0); 11198 SDNode *V = Vec.getNode(); 11199 unsigned nextIndex = 0; 11200 11201 // For each operand of the ADD (each is a BUILD_VECTOR), check whether 11202 // each of its operands is an EXTRACT_VECTOR with the same vector and 11203 // the appropriate index. 11204 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { 11205 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT 11206 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11207 11208 SDValue ExtVec0 = N0->getOperand(i); 11209 SDValue ExtVec1 = N1->getOperand(i); 11210 11211 // First operand is the vector, verify it's the same. 11212 if (V != ExtVec0->getOperand(0).getNode() || 11213 V != ExtVec1->getOperand(0).getNode()) 11214 return SDValue(); 11215 11216 // Second is the constant, verify it's correct. 11217 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); 11218 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); 11219 11220 // For the constant, we want to see all the even or all the odd indices. 11221 if (!C0 || !C1 || C0->getZExtValue() != nextIndex 11222 || C1->getZExtValue() != nextIndex+1) 11223 return SDValue(); 11224 11225 // Increment index. 11226 nextIndex+=2; 11227 } else 11228 return SDValue(); 11229 } 11230 11231 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure 11232 // we're using the entire input vector, otherwise there's a size/legality 11233 // mismatch somewhere. 11234 if (nextIndex != Vec.getValueType().getVectorNumElements() || 11235 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 11236 return SDValue(); 11237 11238 // Create VPADDL node. 11239 SelectionDAG &DAG = DCI.DAG; 11240 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11241 11242 SDLoc dl(N); 11243 11244 // Build operand list. 11245 SmallVector<SDValue, 8> Ops; 11246 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 11247 TLI.getPointerTy(DAG.getDataLayout()))); 11248 11249 // Input is the vector. 11250 Ops.push_back(Vec); 11251 11252 // Get widened type and narrowed type. 11253 MVT widenType; 11254 unsigned numElem = VT.getVectorNumElements(); 11255 11256 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 11257 switch (inputLaneType.getSimpleVT().SimpleTy) { 11258 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 11259 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 11260 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 11261 default: 11262 llvm_unreachable("Invalid vector element type for padd optimization."); 11263 } 11264 11265 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 11266 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ?
ISD::ANY_EXTEND : ISD::TRUNCATE; 11267 return DAG.getNode(ExtOp, dl, VT, tmp); 11268 } 11269 11270 static SDValue findMUL_LOHI(SDValue V) { 11271 if (V->getOpcode() == ISD::UMUL_LOHI || 11272 V->getOpcode() == ISD::SMUL_LOHI) 11273 return V; 11274 return SDValue(); 11275 } 11276 11277 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 11278 TargetLowering::DAGCombinerInfo &DCI, 11279 const ARMSubtarget *Subtarget) { 11280 if (!Subtarget->hasBaseDSP()) 11281 return SDValue(); 11282 11283 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 11284 // accumulates the product into a 64-bit value. The 16-bit values will 11285 // be sign extended somehow or SRA'd into 32-bit values 11286 // (addc (adde (mul 16bit, 16bit), lo), hi) 11287 SDValue Mul = AddcNode->getOperand(0); 11288 SDValue Lo = AddcNode->getOperand(1); 11289 if (Mul.getOpcode() != ISD::MUL) { 11290 Lo = AddcNode->getOperand(0); 11291 Mul = AddcNode->getOperand(1); 11292 if (Mul.getOpcode() != ISD::MUL) 11293 return SDValue(); 11294 } 11295 11296 SDValue SRA = AddeNode->getOperand(0); 11297 SDValue Hi = AddeNode->getOperand(1); 11298 if (SRA.getOpcode() != ISD::SRA) { 11299 SRA = AddeNode->getOperand(1); 11300 Hi = AddeNode->getOperand(0); 11301 if (SRA.getOpcode() != ISD::SRA) 11302 return SDValue(); 11303 } 11304 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 11305 if (Const->getZExtValue() != 31) 11306 return SDValue(); 11307 } else 11308 return SDValue(); 11309 11310 if (SRA.getOperand(0) != Mul) 11311 return SDValue(); 11312 11313 SelectionDAG &DAG = DCI.DAG; 11314 SDLoc dl(AddcNode); 11315 unsigned Opcode = 0; 11316 SDValue Op0; 11317 SDValue Op1; 11318 11319 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 11320 Opcode = ARMISD::SMLALBB; 11321 Op0 = Mul.getOperand(0); 11322 Op1 = Mul.getOperand(1); 11323 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 11324 Opcode = ARMISD::SMLALBT; 11325 Op0 = Mul.getOperand(0); 11326 Op1 = Mul.getOperand(1).getOperand(0); 11327 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 11328 Opcode = ARMISD::SMLALTB; 11329 Op0 = Mul.getOperand(0).getOperand(0); 11330 Op1 = Mul.getOperand(1); 11331 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 11332 Opcode = ARMISD::SMLALTT; 11333 Op0 = Mul->getOperand(0).getOperand(0); 11334 Op1 = Mul->getOperand(1).getOperand(0); 11335 } 11336 11337 if (!Op0 || !Op1) 11338 return SDValue(); 11339 11340 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 11341 Op0, Op1, Lo, Hi); 11342 // Replace the ADDs' nodes uses by the MLA node's values. 11343 SDValue HiMLALResult(SMLAL.getNode(), 1); 11344 SDValue LoMLALResult(SMLAL.getNode(), 0); 11345 11346 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 11347 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 11348 11349 // Return original node to notify the driver to stop replacing. 11350 SDValue resNode(AddcNode, 0); 11351 return resNode; 11352 } 11353 11354 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 11355 TargetLowering::DAGCombinerInfo &DCI, 11356 const ARMSubtarget *Subtarget) { 11357 // Look for multiply add opportunities. 11358 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 11359 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 11360 // a glue link from the first add to the second add. 
11361 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 11362 // a S/UMLAL instruction. 11363 // UMUL_LOHI 11364 // / :lo \ :hi 11365 // V \ [no multiline comment] 11366 // loAdd -> ADDC | 11367 // \ :carry / 11368 // V V 11369 // ADDE <- hiAdd 11370 // 11371 // In the special case where only the higher part of a signed result is used 11372 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 11373 // a constant with the exact value of 0x80000000, we recognize we are dealing 11374 // with a "rounded multiply and add" (or subtract) and transform it into 11375 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 11376 11377 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 11378 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 11379 "Expect an ADDE or SUBE"); 11380 11381 assert(AddeSubeNode->getNumOperands() == 3 && 11382 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 11383 "ADDE node has the wrong inputs"); 11384 11385 // Check that we are chained to the right ADDC or SUBC node. 11386 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 11387 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 11388 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 11389 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 11390 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 11391 return SDValue(); 11392 11393 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 11394 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 11395 11396 // Check if the two operands are from the same mul_lohi node. 11397 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 11398 return SDValue(); 11399 11400 assert(AddcSubcNode->getNumValues() == 2 && 11401 AddcSubcNode->getValueType(0) == MVT::i32 && 11402 "Expect ADDC with two result values. First: i32"); 11403 11404 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 11405 // maybe a SMLAL which multiplies two 16-bit values. 11406 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 11407 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 11408 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 11409 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 11410 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 11411 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 11412 11413 // Check for the triangle shape. 11414 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 11415 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 11416 11417 // Make sure that the ADDE/SUBE operands are not coming from the same node. 11418 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 11419 return SDValue(); 11420 11421 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 11422 bool IsLeftOperandMUL = false; 11423 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 11424 if (MULOp == SDValue()) 11425 MULOp = findMUL_LOHI(AddeSubeOp1); 11426 else 11427 IsLeftOperandMUL = true; 11428 if (MULOp == SDValue()) 11429 return SDValue(); 11430 11431 // Figure out the right opcode. 11432 unsigned Opc = MULOp->getOpcode(); 11433 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 11434 11435 // Figure out the high and low input values to the MLAL node. 11436 SDValue *HiAddSub = nullptr; 11437 SDValue *LoMul = nullptr; 11438 SDValue *LowAddSub = nullptr; 11439 11440 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 
11441 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 11442 return SDValue(); 11443 11444 if (IsLeftOperandMUL) 11445 HiAddSub = &AddeSubeOp1; 11446 else 11447 HiAddSub = &AddeSubeOp0; 11448 11449 // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI 11450 // node whose low result is fed to the ADDC/SUBC we are checking. 11451 11452 if (AddcSubcOp0 == MULOp.getValue(0)) { 11453 LoMul = &AddcSubcOp0; 11454 LowAddSub = &AddcSubcOp1; 11455 } 11456 if (AddcSubcOp1 == MULOp.getValue(0)) { 11457 LoMul = &AddcSubcOp1; 11458 LowAddSub = &AddcSubcOp0; 11459 } 11460 11461 if (!LoMul) 11462 return SDValue(); 11463 11464 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC, 11465 // the replacement below will create a cycle. 11466 if (AddcSubcNode == HiAddSub->getNode() || 11467 AddcSubcNode->isPredecessorOf(HiAddSub->getNode())) 11468 return SDValue(); 11469 11470 // Create the merged node. 11471 SelectionDAG &DAG = DCI.DAG; 11472 11473 // Start building operand list. 11474 SmallVector<SDValue, 8> Ops; 11475 Ops.push_back(LoMul->getOperand(0)); 11476 Ops.push_back(LoMul->getOperand(1)); 11477 11478 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be 11479 // the case, we must be doing a signed multiplication and only use the higher 11480 // part of the result of the MLAL; furthermore, the LowAddSub must be a 11481 // constant addition or subtraction of the value 0x80000000. 11482 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() && 11483 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) && 11484 LowAddSub->getNode()->getOpcode() == ISD::Constant && 11485 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() == 11486 0x80000000) { 11487 Ops.push_back(*HiAddSub); 11488 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) { 11489 FinalOpc = ARMISD::SMMLSR; 11490 } else { 11491 FinalOpc = ARMISD::SMMLAR; 11492 } 11493 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops); 11494 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode); 11495 11496 return SDValue(AddeSubeNode, 0); 11497 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC) 11498 // SMMLS is generated during instruction selection and the rest of this 11499 // function cannot handle the case where AddcSubcNode is a SUBC. 11500 return SDValue(); 11501 11502 // Finish building the operand list for {U/S}MLAL 11503 Ops.push_back(*LowAddSub); 11504 Ops.push_back(*HiAddSub); 11505 11506 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), 11507 DAG.getVTList(MVT::i32, MVT::i32), Ops); 11508 11509 // Replace the ADD nodes' uses with the MLAL node's values. 11510 SDValue HiMLALResult(MLALNode.getNode(), 1); 11511 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult); 11512 11513 SDValue LoMLALResult(MLALNode.getNode(), 0); 11514 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult); 11515 11516 // Return original node to notify the driver to stop replacing. 11517 return SDValue(AddeSubeNode, 0); 11518 } 11519 11520 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, 11521 TargetLowering::DAGCombinerInfo &DCI, 11522 const ARMSubtarget *Subtarget) { 11523 // UMAAL is similar to UMLAL except that it adds two unsigned values. 11524 // While trying to combine for the other MLAL nodes, first search for the 11525 // chance to use UMAAL. Check if Addc uses a node which has already 11526 // been combined into a UMLAL.
The other pattern is UMLAL using Addc/Adde 11527 // as the addend, and it's handled in PerformUMLALCombine. 11528 11529 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11530 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11531 11532 // Check that we have a glued ADDC node. 11533 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 11534 if (AddcNode->getOpcode() != ARMISD::ADDC) 11535 return SDValue(); 11536 11537 // Find the converted UMAAL or quit if it doesn't exist. 11538 SDNode *UmlalNode = nullptr; 11539 SDValue AddHi; 11540 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 11541 UmlalNode = AddcNode->getOperand(0).getNode(); 11542 AddHi = AddcNode->getOperand(1); 11543 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 11544 UmlalNode = AddcNode->getOperand(1).getNode(); 11545 AddHi = AddcNode->getOperand(0); 11546 } else { 11547 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11548 } 11549 11550 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 11551 // the ADDC as well as Zero. 11552 if (!isNullConstant(UmlalNode->getOperand(3))) 11553 return SDValue(); 11554 11555 if ((isNullConstant(AddeNode->getOperand(0)) && 11556 AddeNode->getOperand(1).getNode() == UmlalNode) || 11557 (AddeNode->getOperand(0).getNode() == UmlalNode && 11558 isNullConstant(AddeNode->getOperand(1)))) { 11559 SelectionDAG &DAG = DCI.DAG; 11560 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 11561 UmlalNode->getOperand(2), AddHi }; 11562 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 11563 DAG.getVTList(MVT::i32, MVT::i32), Ops); 11564 11565 // Replace the ADDs' nodes uses by the UMAAL node's values. 11566 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 11567 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 11568 11569 // Return original node to notify the driver to stop replacing. 11570 return SDValue(AddeNode, 0); 11571 } 11572 return SDValue(); 11573 } 11574 11575 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 11576 const ARMSubtarget *Subtarget) { 11577 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11578 return SDValue(); 11579 11580 // Check that we have a pair of ADDC and ADDE as operands. 11581 // Both addends of the ADDE must be zero. 
11582 SDNode* AddcNode = N->getOperand(2).getNode(); 11583 SDNode* AddeNode = N->getOperand(3).getNode(); 11584 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 11585 (AddeNode->getOpcode() == ARMISD::ADDE) && 11586 isNullConstant(AddeNode->getOperand(0)) && 11587 isNullConstant(AddeNode->getOperand(1)) && 11588 (AddeNode->getOperand(2).getNode() == AddcNode)) 11589 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 11590 DAG.getVTList(MVT::i32, MVT::i32), 11591 {N->getOperand(0), N->getOperand(1), 11592 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 11593 else 11594 return SDValue(); 11595 } 11596 11597 static SDValue PerformAddcSubcCombine(SDNode *N, 11598 TargetLowering::DAGCombinerInfo &DCI, 11599 const ARMSubtarget *Subtarget) { 11600 SelectionDAG &DAG(DCI.DAG); 11601 11602 if (N->getOpcode() == ARMISD::SUBC) { 11603 // (SUBC (ADDE 0, 0, C), 1) -> C 11604 SDValue LHS = N->getOperand(0); 11605 SDValue RHS = N->getOperand(1); 11606 if (LHS->getOpcode() == ARMISD::ADDE && 11607 isNullConstant(LHS->getOperand(0)) && 11608 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 11609 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 11610 } 11611 } 11612 11613 if (Subtarget->isThumb1Only()) { 11614 SDValue RHS = N->getOperand(1); 11615 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11616 int32_t imm = C->getSExtValue(); 11617 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 11618 SDLoc DL(N); 11619 RHS = DAG.getConstant(-imm, DL, MVT::i32); 11620 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 11621 : ARMISD::ADDC; 11622 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 11623 } 11624 } 11625 } 11626 11627 return SDValue(); 11628 } 11629 11630 static SDValue PerformAddeSubeCombine(SDNode *N, 11631 TargetLowering::DAGCombinerInfo &DCI, 11632 const ARMSubtarget *Subtarget) { 11633 if (Subtarget->isThumb1Only()) { 11634 SelectionDAG &DAG = DCI.DAG; 11635 SDValue RHS = N->getOperand(1); 11636 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11637 int64_t imm = C->getSExtValue(); 11638 if (imm < 0) { 11639 SDLoc DL(N); 11640 11641 // The with-carry-in form matches bitwise not instead of the negation. 11642 // Effectively, the inverse interpretation of the carry flag already 11643 // accounts for part of the negation. 11644 RHS = DAG.getConstant(~imm, DL, MVT::i32); 11645 11646 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? 
ARMISD::SUBE 11647 : ARMISD::ADDE; 11648 return DAG.getNode(Opcode, DL, N->getVTList(), 11649 N->getOperand(0), RHS, N->getOperand(2)); 11650 } 11651 } 11652 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 11653 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 11654 } 11655 return SDValue(); 11656 } 11657 11658 static SDValue PerformABSCombine(SDNode *N, 11659 TargetLowering::DAGCombinerInfo &DCI, 11660 const ARMSubtarget *Subtarget) { 11661 SDValue res; 11662 SelectionDAG &DAG = DCI.DAG; 11663 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11664 11665 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 11666 return SDValue(); 11667 11668 if (!TLI.expandABS(N, res, DAG)) 11669 return SDValue(); 11670 11671 return res; 11672 } 11673 11674 /// PerformADDECombine - Target-specific dag combine transform from 11675 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 11676 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 11677 static SDValue PerformADDECombine(SDNode *N, 11678 TargetLowering::DAGCombinerInfo &DCI, 11679 const ARMSubtarget *Subtarget) { 11680 // Only ARM and Thumb2 support UMLAL/SMLAL. 11681 if (Subtarget->isThumb1Only()) 11682 return PerformAddeSubeCombine(N, DCI, Subtarget); 11683 11684 // Only perform the checks after legalize when the pattern is available. 11685 if (DCI.isBeforeLegalize()) return SDValue(); 11686 11687 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 11688 } 11689 11690 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 11691 /// operands N0 and N1. This is a helper for PerformADDCombine that is 11692 /// called with the default operands, and if that fails, with commuted 11693 /// operands. 11694 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 11695 TargetLowering::DAGCombinerInfo &DCI, 11696 const ARMSubtarget *Subtarget){ 11697 // Attempt to create vpadd for this add. 11698 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 11699 return Result; 11700 11701 // Attempt to create vpaddl for this add. 11702 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 11703 return Result; 11704 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 11705 Subtarget)) 11706 return Result; 11707 11708 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11709 if (N0.getNode()->hasOneUse()) 11710 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 11711 return Result; 11712 return SDValue(); 11713 } 11714 11715 bool 11716 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 11717 CombineLevel Level) const { 11718 if (Level == BeforeLegalizeTypes) 11719 return true; 11720 11721 if (N->getOpcode() != ISD::SHL) 11722 return true; 11723 11724 if (Subtarget->isThumb1Only()) { 11725 // Avoid making expensive immediates by commuting shifts. (This logic 11726 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 11727 // for free.) 
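    // For example, (shl (add x, 255), 2) keeps 255 as a valid Thumb1
    // immediate, whereas the commuted form (add (shl x, 2), 1020) would
    // force 1020 to be materialized into a register first.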
if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}

static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl.

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
11817 if (U->getOperand(0).getOpcode() == ISD::SHL || 11818 U->getOperand(1).getOpcode() == ISD::SHL) 11819 return SDValue(); 11820 break; 11821 } 11822 } 11823 11824 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 11825 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 11826 return SDValue(); 11827 11828 if (N->getOperand(0).getOpcode() != ISD::SHL) 11829 return SDValue(); 11830 11831 SDValue SHL = N->getOperand(0); 11832 11833 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11834 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 11835 if (!C1ShlC2 || !C2) 11836 return SDValue(); 11837 11838 APInt C2Int = C2->getAPIntValue(); 11839 APInt C1Int = C1ShlC2->getAPIntValue(); 11840 11841 // Check that performing a lshr will not lose any information. 11842 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 11843 C2Int.getBitWidth() - C2->getZExtValue()); 11844 if ((C1Int & Mask) != C1Int) 11845 return SDValue(); 11846 11847 // Shift the first constant. 11848 C1Int.lshrInPlace(C2Int); 11849 11850 // The immediates are encoded as an 8-bit value that can be rotated. 11851 auto LargeImm = [](const APInt &Imm) { 11852 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 11853 return Imm.getBitWidth() - Zeros > 8; 11854 }; 11855 11856 if (LargeImm(C1Int) || LargeImm(C2Int)) 11857 return SDValue(); 11858 11859 SelectionDAG &DAG = DCI.DAG; 11860 SDLoc dl(N); 11861 SDValue X = SHL.getOperand(0); 11862 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 11863 DAG.getConstant(C1Int, dl, MVT::i32)); 11864 // Shift left to compensate for the lshr of C1Int. 11865 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 11866 11867 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 11868 SHL.dump(); N->dump()); 11869 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 11870 return Res; 11871 } 11872 11873 11874 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 11875 /// 11876 static SDValue PerformADDCombine(SDNode *N, 11877 TargetLowering::DAGCombinerInfo &DCI, 11878 const ARMSubtarget *Subtarget) { 11879 SDValue N0 = N->getOperand(0); 11880 SDValue N1 = N->getOperand(1); 11881 11882 // Only works one way, because it needs an immediate operand. 11883 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11884 return Result; 11885 11886 // First try with the default operand order. 11887 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 11888 return Result; 11889 11890 // If that didn't work, try again with the operands commuted. 11891 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 11892 } 11893 11894 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 11895 /// 11896 static SDValue PerformSUBCombine(SDNode *N, 11897 TargetLowering::DAGCombinerInfo &DCI, 11898 const ARMSubtarget *Subtarget) { 11899 SDValue N0 = N->getOperand(0); 11900 SDValue N1 = N->getOperand(1); 11901 11902 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11903 if (N1.getNode()->hasOneUse()) 11904 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 11905 return Result; 11906 11907 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 11908 return SDValue(); 11909 11910 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 11911 // so that we can readily pattern match more mve instructions which can use 11912 // a scalar operand. 
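  //
  // E.g. a v4i32 (sub (splat 0), (splat x)) collapses to a splat of the
  // scalar (sub 0, x): the vector subtract disappears and only the splat of
  // the negated scalar is left for later MVE pattern matching.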
11913 SDValue VDup = N->getOperand(1); 11914 if (VDup->getOpcode() != ARMISD::VDUP) 11915 return SDValue(); 11916 11917 SDValue VMov = N->getOperand(0); 11918 if (VMov->getOpcode() == ISD::BITCAST) 11919 VMov = VMov->getOperand(0); 11920 11921 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 11922 return SDValue(); 11923 11924 SDLoc dl(N); 11925 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 11926 DCI.DAG.getConstant(0, dl, MVT::i32), 11927 VDup->getOperand(0)); 11928 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 11929 } 11930 11931 /// PerformVMULCombine 11932 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 11933 /// special multiplier accumulator forwarding. 11934 /// vmul d3, d0, d2 11935 /// vmla d3, d1, d2 11936 /// is faster than 11937 /// vadd d3, d0, d1 11938 /// vmul d3, d3, d2 11939 // However, for (A + B) * (A + B), 11940 // vadd d2, d0, d1 11941 // vmul d3, d0, d2 11942 // vmla d3, d1, d2 11943 // is slower than 11944 // vadd d2, d0, d1 11945 // vmul d3, d2, d2 11946 static SDValue PerformVMULCombine(SDNode *N, 11947 TargetLowering::DAGCombinerInfo &DCI, 11948 const ARMSubtarget *Subtarget) { 11949 if (!Subtarget->hasVMLxForwarding()) 11950 return SDValue(); 11951 11952 SelectionDAG &DAG = DCI.DAG; 11953 SDValue N0 = N->getOperand(0); 11954 SDValue N1 = N->getOperand(1); 11955 unsigned Opcode = N0.getOpcode(); 11956 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11957 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 11958 Opcode = N1.getOpcode(); 11959 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11960 Opcode != ISD::FADD && Opcode != ISD::FSUB) 11961 return SDValue(); 11962 std::swap(N0, N1); 11963 } 11964 11965 if (N0 == N1) 11966 return SDValue(); 11967 11968 EVT VT = N->getValueType(0); 11969 SDLoc DL(N); 11970 SDValue N00 = N0->getOperand(0); 11971 SDValue N01 = N0->getOperand(1); 11972 return DAG.getNode(Opcode, DL, VT, 11973 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 11974 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 11975 } 11976 11977 static SDValue PerformMULCombine(SDNode *N, 11978 TargetLowering::DAGCombinerInfo &DCI, 11979 const ARMSubtarget *Subtarget) { 11980 SelectionDAG &DAG = DCI.DAG; 11981 11982 if (Subtarget->isThumb1Only()) 11983 return SDValue(); 11984 11985 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11986 return SDValue(); 11987 11988 EVT VT = N->getValueType(0); 11989 if (VT.is64BitVector() || VT.is128BitVector()) 11990 return PerformVMULCombine(N, DCI, Subtarget); 11991 if (VT != MVT::i32) 11992 return SDValue(); 11993 11994 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11995 if (!C) 11996 return SDValue(); 11997 11998 int64_t MulAmt = C->getSExtValue(); 11999 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 12000 12001 ShiftAmt = ShiftAmt & (32 - 1); 12002 SDValue V = N->getOperand(0); 12003 SDLoc DL(N); 12004 12005 SDValue Res; 12006 MulAmt >>= ShiftAmt; 12007 12008 if (MulAmt >= 0) { 12009 if (isPowerOf2_32(MulAmt - 1)) { 12010 // (mul x, 2^N + 1) => (add (shl x, N), x) 12011 Res = DAG.getNode(ISD::ADD, DL, VT, 12012 V, 12013 DAG.getNode(ISD::SHL, DL, VT, 12014 V, 12015 DAG.getConstant(Log2_32(MulAmt - 1), DL, 12016 MVT::i32))); 12017 } else if (isPowerOf2_32(MulAmt + 1)) { 12018 // (mul x, 2^N - 1) => (sub (shl x, N), x) 12019 Res = DAG.getNode(ISD::SUB, DL, VT, 12020 DAG.getNode(ISD::SHL, DL, VT, 12021 V, 12022 DAG.getConstant(Log2_32(MulAmt + 1), DL, 12023 MVT::i32)), 12024 V); 12025 } else 12026 return SDValue(); 12027 } else { 12028 
uint64_t MulAmtAbs = -MulAmt; 12029 if (isPowerOf2_32(MulAmtAbs + 1)) { 12030 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 12031 Res = DAG.getNode(ISD::SUB, DL, VT, 12032 V, 12033 DAG.getNode(ISD::SHL, DL, VT, 12034 V, 12035 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 12036 MVT::i32))); 12037 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 12038 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 12039 Res = DAG.getNode(ISD::ADD, DL, VT, 12040 V, 12041 DAG.getNode(ISD::SHL, DL, VT, 12042 V, 12043 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 12044 MVT::i32))); 12045 Res = DAG.getNode(ISD::SUB, DL, VT, 12046 DAG.getConstant(0, DL, MVT::i32), Res); 12047 } else 12048 return SDValue(); 12049 } 12050 12051 if (ShiftAmt != 0) 12052 Res = DAG.getNode(ISD::SHL, DL, VT, 12053 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 12054 12055 // Do not add new nodes to DAG combiner worklist. 12056 DCI.CombineTo(N, Res, false); 12057 return SDValue(); 12058 } 12059 12060 static SDValue CombineANDShift(SDNode *N, 12061 TargetLowering::DAGCombinerInfo &DCI, 12062 const ARMSubtarget *Subtarget) { 12063 // Allow DAGCombine to pattern-match before we touch the canonical form. 12064 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12065 return SDValue(); 12066 12067 if (N->getValueType(0) != MVT::i32) 12068 return SDValue(); 12069 12070 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12071 if (!N1C) 12072 return SDValue(); 12073 12074 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 12075 // Don't transform uxtb/uxth. 12076 if (C1 == 255 || C1 == 65535) 12077 return SDValue(); 12078 12079 SDNode *N0 = N->getOperand(0).getNode(); 12080 if (!N0->hasOneUse()) 12081 return SDValue(); 12082 12083 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 12084 return SDValue(); 12085 12086 bool LeftShift = N0->getOpcode() == ISD::SHL; 12087 12088 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 12089 if (!N01C) 12090 return SDValue(); 12091 12092 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 12093 if (!C2 || C2 >= 32) 12094 return SDValue(); 12095 12096 // Clear irrelevant bits in the mask. 12097 if (LeftShift) 12098 C1 &= (-1U << C2); 12099 else 12100 C1 &= (-1U >> C2); 12101 12102 SelectionDAG &DAG = DCI.DAG; 12103 SDLoc DL(N); 12104 12105 // We have a pattern of the form "(and (shl x, c2) c1)" or 12106 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 12107 // transform to a pair of shifts, to save materializing c1. 12108 12109 // First pattern: right shift, then mask off leading bits. 12110 // FIXME: Use demanded bits? 12111 if (!LeftShift && isMask_32(C1)) { 12112 uint32_t C3 = countLeadingZeros(C1); 12113 if (C2 < C3) { 12114 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12115 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12116 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12117 DAG.getConstant(C3, DL, MVT::i32)); 12118 } 12119 } 12120 12121 // First pattern, reversed: left shift, then mask off trailing bits. 12122 if (LeftShift && isMask_32(~C1)) { 12123 uint32_t C3 = countTrailingZeros(C1); 12124 if (C2 < C3) { 12125 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12126 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12127 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12128 DAG.getConstant(C3, DL, MVT::i32)); 12129 } 12130 } 12131 12132 // Second pattern: left shift, then mask off leading bits. 12133 // FIXME: Use demanded bits? 
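  // For example, with illustrative constants, (and (shl x, 4), 0x0ffffff0)
  // can be rewritten as (srl (shl x, 8), 4), avoiding the materialization of
  // the 0x0ffffff0 immediate.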
12134 if (LeftShift && isShiftedMask_32(C1)) { 12135 uint32_t Trailing = countTrailingZeros(C1); 12136 uint32_t C3 = countLeadingZeros(C1); 12137 if (Trailing == C2 && C2 + C3 < 32) { 12138 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12139 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12140 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12141 DAG.getConstant(C3, DL, MVT::i32)); 12142 } 12143 } 12144 12145 // Second pattern, reversed: right shift, then mask off trailing bits. 12146 // FIXME: Handle other patterns of known/demanded bits. 12147 if (!LeftShift && isShiftedMask_32(C1)) { 12148 uint32_t Leading = countLeadingZeros(C1); 12149 uint32_t C3 = countTrailingZeros(C1); 12150 if (Leading == C2 && C2 + C3 < 32) { 12151 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12152 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12153 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12154 DAG.getConstant(C3, DL, MVT::i32)); 12155 } 12156 } 12157 12158 // FIXME: Transform "(and (shl x, c2) c1)" -> 12159 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 12160 // c1. 12161 return SDValue(); 12162 } 12163 12164 static SDValue PerformANDCombine(SDNode *N, 12165 TargetLowering::DAGCombinerInfo &DCI, 12166 const ARMSubtarget *Subtarget) { 12167 // Attempt to use immediate-form VBIC 12168 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12169 SDLoc dl(N); 12170 EVT VT = N->getValueType(0); 12171 SelectionDAG &DAG = DCI.DAG; 12172 12173 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12174 return SDValue(); 12175 12176 APInt SplatBits, SplatUndef; 12177 unsigned SplatBitSize; 12178 bool HasAnyUndefs; 12179 if (BVN && Subtarget->hasNEON() && 12180 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12181 if (SplatBitSize <= 64) { 12182 EVT VbicVT; 12183 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 12184 SplatUndef.getZExtValue(), SplatBitSize, 12185 DAG, dl, VbicVT, VT.is128BitVector(), 12186 OtherModImm); 12187 if (Val.getNode()) { 12188 SDValue Input = 12189 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 12190 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 12191 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 12192 } 12193 } 12194 } 12195 12196 if (!Subtarget->isThumb1Only()) { 12197 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 12198 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 12199 return Result; 12200 12201 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12202 return Result; 12203 } 12204 12205 if (Subtarget->isThumb1Only()) 12206 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 12207 return Result; 12208 12209 return SDValue(); 12210 } 12211 12212 // Try combining OR nodes to SMULWB, SMULWT. 12213 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 12214 TargetLowering::DAGCombinerInfo &DCI, 12215 const ARMSubtarget *Subtarget) { 12216 if (!Subtarget->hasV6Ops() || 12217 (Subtarget->isThumb() && 12218 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 12219 return SDValue(); 12220 12221 SDValue SRL = OR->getOperand(0); 12222 SDValue SHL = OR->getOperand(1); 12223 12224 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 12225 SRL = OR->getOperand(1); 12226 SHL = OR->getOperand(0); 12227 } 12228 if (!isSRL16(SRL) || !isSHL16(SHL)) 12229 return SDValue(); 12230 12231 // The first operands to the shifts need to be the two results from the 12232 // same smul_lohi node. 
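  // Put differently, the OR being combined reassembles bits [16..47] of the
  // 64-bit product, i.e. (a * b) >> 16 truncated to 32 bits, which is what
  // SMULWB/SMULWT compute when one multiplicand is (the top half of) a
  // 16-bit value.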
if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign-extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI.
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+.
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & ~mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // Case (2): or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as-is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
12393 return SDValue(N, 0); 12394 } 12395 12396 return SDValue(); 12397 } 12398 12399 static bool isValidMVECond(unsigned CC, bool IsFloat) { 12400 switch (CC) { 12401 case ARMCC::EQ: 12402 case ARMCC::NE: 12403 case ARMCC::LE: 12404 case ARMCC::GT: 12405 case ARMCC::GE: 12406 case ARMCC::LT: 12407 return true; 12408 case ARMCC::HS: 12409 case ARMCC::HI: 12410 return !IsFloat; 12411 default: 12412 return false; 12413 }; 12414 } 12415 12416 static SDValue PerformORCombine_i1(SDNode *N, 12417 TargetLowering::DAGCombinerInfo &DCI, 12418 const ARMSubtarget *Subtarget) { 12419 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 12420 // together with predicates 12421 EVT VT = N->getValueType(0); 12422 SDValue N0 = N->getOperand(0); 12423 SDValue N1 = N->getOperand(1); 12424 12425 ARMCC::CondCodes CondCode0 = ARMCC::AL; 12426 ARMCC::CondCodes CondCode1 = ARMCC::AL; 12427 if (N0->getOpcode() == ARMISD::VCMP) 12428 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) 12429 ->getZExtValue(); 12430 else if (N0->getOpcode() == ARMISD::VCMPZ) 12431 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) 12432 ->getZExtValue(); 12433 if (N1->getOpcode() == ARMISD::VCMP) 12434 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) 12435 ->getZExtValue(); 12436 else if (N1->getOpcode() == ARMISD::VCMPZ) 12437 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) 12438 ->getZExtValue(); 12439 12440 if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) 12441 return SDValue(); 12442 12443 unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); 12444 unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); 12445 12446 if (!isValidMVECond(Opposite0, 12447 N0->getOperand(0)->getValueType(0).isFloatingPoint()) || 12448 !isValidMVECond(Opposite1, 12449 N1->getOperand(0)->getValueType(0).isFloatingPoint())) 12450 return SDValue(); 12451 12452 SmallVector<SDValue, 4> Ops0; 12453 Ops0.push_back(N0->getOperand(0)); 12454 if (N0->getOpcode() == ARMISD::VCMP) 12455 Ops0.push_back(N0->getOperand(1)); 12456 Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); 12457 SmallVector<SDValue, 4> Ops1; 12458 Ops1.push_back(N1->getOperand(0)); 12459 if (N1->getOpcode() == ARMISD::VCMP) 12460 Ops1.push_back(N1->getOperand(1)); 12461 Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); 12462 12463 SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); 12464 SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); 12465 SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); 12466 return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, 12467 DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); 12468 } 12469 12470 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 12471 static SDValue PerformORCombine(SDNode *N, 12472 TargetLowering::DAGCombinerInfo &DCI, 12473 const ARMSubtarget *Subtarget) { 12474 // Attempt to use immediate-form VORR 12475 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12476 SDLoc dl(N); 12477 EVT VT = N->getValueType(0); 12478 SelectionDAG &DAG = DCI.DAG; 12479 12480 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12481 return SDValue(); 12482 12483 APInt SplatBits, SplatUndef; 12484 unsigned SplatBitSize; 12485 bool HasAnyUndefs; 12486 if (BVN && Subtarget->hasNEON() && 12487 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12488 if (SplatBitSize <= 64) { 12489 EVT 
VorrVT; 12490 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 12491 SplatUndef.getZExtValue(), SplatBitSize, 12492 DAG, dl, VorrVT, VT.is128BitVector(), 12493 OtherModImm); 12494 if (Val.getNode()) { 12495 SDValue Input = 12496 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 12497 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 12498 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 12499 } 12500 } 12501 } 12502 12503 if (!Subtarget->isThumb1Only()) { 12504 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12505 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12506 return Result; 12507 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 12508 return Result; 12509 } 12510 12511 SDValue N0 = N->getOperand(0); 12512 SDValue N1 = N->getOperand(1); 12513 12514 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 12515 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 12516 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12517 12518 // The code below optimizes (or (and X, Y), Z). 12519 // The AND operand needs to have a single user to make these optimizations 12520 // profitable. 12521 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 12522 return SDValue(); 12523 12524 APInt SplatUndef; 12525 unsigned SplatBitSize; 12526 bool HasAnyUndefs; 12527 12528 APInt SplatBits0, SplatBits1; 12529 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 12530 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 12531 // Ensure that the second operand of both ands are constants 12532 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 12533 HasAnyUndefs) && !HasAnyUndefs) { 12534 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 12535 HasAnyUndefs) && !HasAnyUndefs) { 12536 // Ensure that the bit width of the constants are the same and that 12537 // the splat arguments are logical inverses as per the pattern we 12538 // are trying to simplify. 12539 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 12540 SplatBits0 == ~SplatBits1) { 12541 // Canonicalize the vector type to make instruction selection 12542 // simpler. 12543 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 12544 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 12545 N0->getOperand(1), 12546 N0->getOperand(0), 12547 N1->getOperand(0)); 12548 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 12549 } 12550 } 12551 } 12552 } 12553 12554 if (Subtarget->hasMVEIntegerOps() && 12555 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) 12556 return PerformORCombine_i1(N, DCI, Subtarget); 12557 12558 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 12559 // reasonable. 
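  // As a minimal sketch with made-up constants, case (1) of
  // PerformORCombineToBFI turns
  //   (or (and A, 0xffff00ff), 0x00002a00)
  // into (ARMbfi A, 0x2a, 0xffff00ff), inserting 0x2a into bits [8..15] of A.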
12560 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 12561 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 12562 return Res; 12563 } 12564 12565 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12566 return Result; 12567 12568 return SDValue(); 12569 } 12570 12571 static SDValue PerformXORCombine(SDNode *N, 12572 TargetLowering::DAGCombinerInfo &DCI, 12573 const ARMSubtarget *Subtarget) { 12574 EVT VT = N->getValueType(0); 12575 SelectionDAG &DAG = DCI.DAG; 12576 12577 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12578 return SDValue(); 12579 12580 if (!Subtarget->isThumb1Only()) { 12581 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12582 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12583 return Result; 12584 12585 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12586 return Result; 12587 } 12588 12589 return SDValue(); 12590 } 12591 12592 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 12593 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 12594 // their position in "to" (Rd). 12595 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 12596 assert(N->getOpcode() == ARMISD::BFI); 12597 12598 SDValue From = N->getOperand(1); 12599 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 12600 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 12601 12602 // If the Base came from a SHR #C, we can deduce that it is really testing bit 12603 // #C in the base of the SHR. 12604 if (From->getOpcode() == ISD::SRL && 12605 isa<ConstantSDNode>(From->getOperand(1))) { 12606 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 12607 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 12608 FromMask <<= Shift.getLimitedValue(31); 12609 From = From->getOperand(0); 12610 } 12611 12612 return From; 12613 } 12614 12615 // If A and B contain one contiguous set of bits, does A | B == A . B? 12616 // 12617 // Neither A nor B must be zero. 12618 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 12619 unsigned LastActiveBitInA = A.countTrailingZeros(); 12620 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 12621 return LastActiveBitInA - 1 == FirstActiveBitInB; 12622 } 12623 12624 static SDValue FindBFIToCombineWith(SDNode *N) { 12625 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 12626 // if one exists. 12627 APInt ToMask, FromMask; 12628 SDValue From = ParseBFI(N, ToMask, FromMask); 12629 SDValue To = N->getOperand(0); 12630 12631 // Now check for a compatible BFI to merge with. We can pass through BFIs that 12632 // aren't compatible, but not if they set the same bit in their destination as 12633 // we do (or that of any BFI we're going to combine with). 12634 SDValue V = To; 12635 APInt CombinedToMask = ToMask; 12636 while (V.getOpcode() == ARMISD::BFI) { 12637 APInt NewToMask, NewFromMask; 12638 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 12639 if (NewFrom != From) { 12640 // This BFI has a different base. Keep going. 12641 CombinedToMask |= NewToMask; 12642 V = V.getOperand(0); 12643 continue; 12644 } 12645 12646 // Do the written bits conflict with any we've seen so far? 12647 if ((NewToMask & CombinedToMask).getBoolValue()) 12648 // Conflicting bits - bail out because going further is unsafe. 
12649 return SDValue(); 12650 12651 // Are the new bits contiguous when combined with the old bits? 12652 if (BitsProperlyConcatenate(ToMask, NewToMask) && 12653 BitsProperlyConcatenate(FromMask, NewFromMask)) 12654 return V; 12655 if (BitsProperlyConcatenate(NewToMask, ToMask) && 12656 BitsProperlyConcatenate(NewFromMask, FromMask)) 12657 return V; 12658 12659 // We've seen a write to some bits, so track it. 12660 CombinedToMask |= NewToMask; 12661 // Keep going... 12662 V = V.getOperand(0); 12663 } 12664 12665 return SDValue(); 12666 } 12667 12668 static SDValue PerformBFICombine(SDNode *N, 12669 TargetLowering::DAGCombinerInfo &DCI) { 12670 SDValue N1 = N->getOperand(1); 12671 if (N1.getOpcode() == ISD::AND) { 12672 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 12673 // the bits being cleared by the AND are not demanded by the BFI. 12674 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 12675 if (!N11C) 12676 return SDValue(); 12677 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12678 unsigned LSB = countTrailingZeros(~InvMask); 12679 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 12680 assert(Width < 12681 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 12682 "undefined behavior"); 12683 unsigned Mask = (1u << Width) - 1; 12684 unsigned Mask2 = N11C->getZExtValue(); 12685 if ((Mask & (~Mask2)) == 0) 12686 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 12687 N->getOperand(0), N1.getOperand(0), 12688 N->getOperand(2)); 12689 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 12690 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 12691 // Keep track of any consecutive bits set that all come from the same base 12692 // value. We can combine these together into a single BFI. 12693 SDValue CombineBFI = FindBFIToCombineWith(N); 12694 if (CombineBFI == SDValue()) 12695 return SDValue(); 12696 12697 // We've found a BFI. 12698 APInt ToMask1, FromMask1; 12699 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 12700 12701 APInt ToMask2, FromMask2; 12702 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 12703 assert(From1 == From2); 12704 (void)From2; 12705 12706 // First, unlink CombineBFI. 12707 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 12708 // Then create a new BFI, combining the two together. 12709 APInt NewFromMask = FromMask1 | FromMask2; 12710 APInt NewToMask = ToMask1 | ToMask2; 12711 12712 EVT VT = N->getValueType(0); 12713 SDLoc dl(N); 12714 12715 if (NewFromMask[0] == 0) 12716 From1 = DCI.DAG.getNode( 12717 ISD::SRL, dl, VT, From1, 12718 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 12719 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 12720 DCI.DAG.getConstant(~NewToMask, dl, VT)); 12721 } 12722 return SDValue(); 12723 } 12724 12725 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 12726 /// ARMISD::VMOVRRD. 
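/// For example, (VMOVRRD (VMOVDRR x, y)) simply becomes the pair (x, y), and
/// a VMOVRRD of a non-volatile f64 frame load can be split into two i32
/// loads, so the value does not have to pass through a VFP register at all.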
12727 static SDValue PerformVMOVRRDCombine(SDNode *N, 12728 TargetLowering::DAGCombinerInfo &DCI, 12729 const ARMSubtarget *Subtarget) { 12730 // vmovrrd(vmovdrr x, y) -> x,y 12731 SDValue InDouble = N->getOperand(0); 12732 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 12733 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 12734 12735 // vmovrrd(load f64) -> (load i32), (load i32) 12736 SDNode *InNode = InDouble.getNode(); 12737 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 12738 InNode->getValueType(0) == MVT::f64 && 12739 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 12740 !cast<LoadSDNode>(InNode)->isVolatile()) { 12741 // TODO: Should this be done for non-FrameIndex operands? 12742 LoadSDNode *LD = cast<LoadSDNode>(InNode); 12743 12744 SelectionDAG &DAG = DCI.DAG; 12745 SDLoc DL(LD); 12746 SDValue BasePtr = LD->getBasePtr(); 12747 SDValue NewLD1 = 12748 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 12749 LD->getAlignment(), LD->getMemOperand()->getFlags()); 12750 12751 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 12752 DAG.getConstant(4, DL, MVT::i32)); 12753 12754 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 12755 LD->getPointerInfo().getWithOffset(4), 12756 std::min(4U, LD->getAlignment()), 12757 LD->getMemOperand()->getFlags()); 12758 12759 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 12760 if (DCI.DAG.getDataLayout().isBigEndian()) 12761 std::swap (NewLD1, NewLD2); 12762 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 12763 return Result; 12764 } 12765 12766 return SDValue(); 12767 } 12768 12769 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 12770 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 12771 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 12772 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 12773 SDValue Op0 = N->getOperand(0); 12774 SDValue Op1 = N->getOperand(1); 12775 if (Op0.getOpcode() == ISD::BITCAST) 12776 Op0 = Op0.getOperand(0); 12777 if (Op1.getOpcode() == ISD::BITCAST) 12778 Op1 = Op1.getOperand(0); 12779 if (Op0.getOpcode() == ARMISD::VMOVRRD && 12780 Op0.getNode() == Op1.getNode() && 12781 Op0.getResNo() == 0 && Op1.getResNo() == 1) 12782 return DAG.getNode(ISD::BITCAST, SDLoc(N), 12783 N->getValueType(0), Op0.getOperand(0)); 12784 return SDValue(); 12785 } 12786 12787 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 12788 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 12789 /// i64 vector to have f64 elements, since the value can then be loaded 12790 /// directly into a VFP register. 12791 static bool hasNormalLoadOperand(SDNode *N) { 12792 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 12793 for (unsigned i = 0; i < NumElts; ++i) { 12794 SDNode *Elt = N->getOperand(i).getNode(); 12795 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 12796 return true; 12797 } 12798 return false; 12799 } 12800 12801 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 12802 /// ISD::BUILD_VECTOR. 12803 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 12804 TargetLowering::DAGCombinerInfo &DCI, 12805 const ARMSubtarget *Subtarget) { 12806 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 12807 // VMOVRRD is introduced when legalizing i64 types. 
It forces the i64 value 12808 // into a pair of GPRs, which is fine when the value is used as a scalar, 12809 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 12810 SelectionDAG &DAG = DCI.DAG; 12811 if (N->getNumOperands() == 2) 12812 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 12813 return RV; 12814 12815 // Load i64 elements as f64 values so that type legalization does not split 12816 // them up into i32 values. 12817 EVT VT = N->getValueType(0); 12818 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 12819 return SDValue(); 12820 SDLoc dl(N); 12821 SmallVector<SDValue, 8> Ops; 12822 unsigned NumElts = VT.getVectorNumElements(); 12823 for (unsigned i = 0; i < NumElts; ++i) { 12824 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 12825 Ops.push_back(V); 12826 // Make the DAGCombiner fold the bitcast. 12827 DCI.AddToWorklist(V.getNode()); 12828 } 12829 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 12830 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 12831 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 12832 } 12833 12834 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 12835 static SDValue 12836 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12837 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 12838 // At that time, we may have inserted bitcasts from integer to float. 12839 // If these bitcasts have survived DAGCombine, change the lowering of this 12840 // BUILD_VECTOR in something more vector friendly, i.e., that does not 12841 // force to use floating point types. 12842 12843 // Make sure we can change the type of the vector. 12844 // This is possible iff: 12845 // 1. The vector is only used in a bitcast to a integer type. I.e., 12846 // 1.1. Vector is used only once. 12847 // 1.2. Use is a bit convert to an integer type. 12848 // 2. The size of its operands are 32-bits (64-bits are not legal). 12849 EVT VT = N->getValueType(0); 12850 EVT EltVT = VT.getVectorElementType(); 12851 12852 // Check 1.1. and 2. 12853 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 12854 return SDValue(); 12855 12856 // By construction, the input type must be float. 12857 assert(EltVT == MVT::f32 && "Unexpected type!"); 12858 12859 // Check 1.2. 12860 SDNode *Use = *N->use_begin(); 12861 if (Use->getOpcode() != ISD::BITCAST || 12862 Use->getValueType(0).isFloatingPoint()) 12863 return SDValue(); 12864 12865 // Check profitability. 12866 // Model is, if more than half of the relevant operands are bitcast from 12867 // i32, turn the build_vector into a sequence of insert_vector_elt. 12868 // Relevant operands are everything that is not statically 12869 // (i.e., at compile time) bitcasted. 12870 unsigned NumOfBitCastedElts = 0; 12871 unsigned NumElts = VT.getVectorNumElements(); 12872 unsigned NumOfRelevantElts = NumElts; 12873 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 12874 SDValue Elt = N->getOperand(Idx); 12875 if (Elt->getOpcode() == ISD::BITCAST) { 12876 // Assume only bit cast to i32 will go away. 12877 if (Elt->getOperand(0).getValueType() == MVT::i32) 12878 ++NumOfBitCastedElts; 12879 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 12880 // Constants are statically casted, thus do not count them as 12881 // relevant operands. 12882 --NumOfRelevantElts; 12883 } 12884 12885 // Check if more than half of the elements require a non-free bitcast. 
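  // E.g. for a v4f32 ARMISD::BUILD_VECTOR where three of the four
  // non-constant operands are (bitcast i32 ... to f32), rebuilding the vector
  // as v4i32 INSERT_VECTOR_ELTs lets those three bitcasts fold away, at the
  // cost of a single new bitcast for the remaining f32 operand.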
12886 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 12887 return SDValue(); 12888 12889 SelectionDAG &DAG = DCI.DAG; 12890 // Create the new vector type. 12891 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 12892 // Check if the type is legal. 12893 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12894 if (!TLI.isTypeLegal(VecVT)) 12895 return SDValue(); 12896 12897 // Combine: 12898 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 12899 // => BITCAST INSERT_VECTOR_ELT 12900 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 12901 // (BITCAST EN), N. 12902 SDValue Vec = DAG.getUNDEF(VecVT); 12903 SDLoc dl(N); 12904 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 12905 SDValue V = N->getOperand(Idx); 12906 if (V.isUndef()) 12907 continue; 12908 if (V.getOpcode() == ISD::BITCAST && 12909 V->getOperand(0).getValueType() == MVT::i32) 12910 // Fold obvious case. 12911 V = V.getOperand(0); 12912 else { 12913 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 12914 // Make the DAGCombiner fold the bitcasts. 12915 DCI.AddToWorklist(V.getNode()); 12916 } 12917 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 12918 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 12919 } 12920 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 12921 // Make the DAGCombiner fold the bitcasts. 12922 DCI.AddToWorklist(Vec.getNode()); 12923 return Vec; 12924 } 12925 12926 static SDValue 12927 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12928 EVT VT = N->getValueType(0); 12929 SDValue Op = N->getOperand(0); 12930 SDLoc dl(N); 12931 12932 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 12933 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 12934 // If the valuetypes are the same, we can remove the cast entirely. 12935 if (Op->getOperand(0).getValueType() == VT) 12936 return Op->getOperand(0); 12937 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, 12938 Op->getOperand(0).getValueType(), Op->getOperand(0)); 12939 } 12940 12941 return SDValue(); 12942 } 12943 12944 static SDValue PerformVCMPCombine(SDNode *N, 12945 TargetLowering::DAGCombinerInfo &DCI, 12946 const ARMSubtarget *Subtarget) { 12947 if (!Subtarget->hasMVEIntegerOps()) 12948 return SDValue(); 12949 12950 EVT VT = N->getValueType(0); 12951 SDValue Op0 = N->getOperand(0); 12952 SDValue Op1 = N->getOperand(1); 12953 ARMCC::CondCodes Cond = 12954 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12955 SDLoc dl(N); 12956 12957 // vcmp X, 0, cc -> vcmpz X, cc 12958 if (isZeroVector(Op1)) 12959 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, 12960 N->getOperand(2)); 12961 12962 unsigned SwappedCond = getSwappedCondition(Cond); 12963 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 12964 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 12965 if (isZeroVector(Op0)) 12966 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 12967 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12968 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 12969 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 12970 return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 12971 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12972 } 12973 12974 return SDValue(); 12975 } 12976 12977 /// PerformInsertEltCombine - Target-specific dag combine xforms for 12978 /// ISD::INSERT_VECTOR_ELT. 12979 static SDValue PerformInsertEltCombine(SDNode *N, 12980 TargetLowering::DAGCombinerInfo &DCI) { 12981 // Bitcast an i64 load inserted into a vector to f64. 
12982 // Otherwise, the i64 value will be legalized to a pair of i32 values. 12983 EVT VT = N->getValueType(0); 12984 SDNode *Elt = N->getOperand(1).getNode(); 12985 if (VT.getVectorElementType() != MVT::i64 || 12986 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 12987 return SDValue(); 12988 12989 SelectionDAG &DAG = DCI.DAG; 12990 SDLoc dl(N); 12991 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 12992 VT.getVectorNumElements()); 12993 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 12994 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 12995 // Make the DAGCombiner fold the bitcasts. 12996 DCI.AddToWorklist(Vec.getNode()); 12997 DCI.AddToWorklist(V.getNode()); 12998 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 12999 Vec, V, N->getOperand(2)); 13000 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 13001 } 13002 13003 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 13004 /// ISD::VECTOR_SHUFFLE. 13005 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 13006 // The LLVM shufflevector instruction does not require the shuffle mask 13007 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 13008 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 13009 // operands do not match the mask length, they are extended by concatenating 13010 // them with undef vectors. That is probably the right thing for other 13011 // targets, but for NEON it is better to concatenate two double-register 13012 // size vector operands into a single quad-register size vector. Do that 13013 // transformation here: 13014 // shuffle(concat(v1, undef), concat(v2, undef)) -> 13015 // shuffle(concat(v1, v2), undef) 13016 SDValue Op0 = N->getOperand(0); 13017 SDValue Op1 = N->getOperand(1); 13018 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 13019 Op1.getOpcode() != ISD::CONCAT_VECTORS || 13020 Op0.getNumOperands() != 2 || 13021 Op1.getNumOperands() != 2) 13022 return SDValue(); 13023 SDValue Concat0Op1 = Op0.getOperand(1); 13024 SDValue Concat1Op1 = Op1.getOperand(1); 13025 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 13026 return SDValue(); 13027 // Skip the transformation if any of the types are illegal. 13028 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13029 EVT VT = N->getValueType(0); 13030 if (!TLI.isTypeLegal(VT) || 13031 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 13032 !TLI.isTypeLegal(Concat1Op1.getValueType())) 13033 return SDValue(); 13034 13035 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 13036 Op0.getOperand(0), Op1.getOperand(0)); 13037 // Translate the shuffle mask. 13038 SmallVector<int, 16> NewMask; 13039 unsigned NumElts = VT.getVectorNumElements(); 13040 unsigned HalfElts = NumElts/2; 13041 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 13042 for (unsigned n = 0; n < NumElts; ++n) { 13043 int MaskElt = SVN->getMaskElt(n); 13044 int NewElt = -1; 13045 if (MaskElt < (int)HalfElts) 13046 NewElt = MaskElt; 13047 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 13048 NewElt = HalfElts + MaskElt - NumElts; 13049 NewMask.push_back(NewElt); 13050 } 13051 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 13052 DAG.getUNDEF(VT), NewMask); 13053 } 13054 13055 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 13056 /// NEON load/store intrinsics, and generic vector load/stores, to merge 13057 /// base address updates. 
13058 /// For generic load/stores, the memory type is assumed to be a vector. 13059 /// The caller is assumed to have checked legality. 13060 static SDValue CombineBaseUpdate(SDNode *N, 13061 TargetLowering::DAGCombinerInfo &DCI) { 13062 SelectionDAG &DAG = DCI.DAG; 13063 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 13064 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 13065 const bool isStore = N->getOpcode() == ISD::STORE; 13066 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 13067 SDValue Addr = N->getOperand(AddrOpIdx); 13068 MemSDNode *MemN = cast<MemSDNode>(N); 13069 SDLoc dl(N); 13070 13071 // Search for a use of the address operand that is an increment. 13072 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 13073 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 13074 SDNode *User = *UI; 13075 if (User->getOpcode() != ISD::ADD || 13076 UI.getUse().getResNo() != Addr.getResNo()) 13077 continue; 13078 13079 // Check that the add is independent of the load/store. Otherwise, folding 13080 // it would create a cycle. We can avoid searching through Addr as it's a 13081 // predecessor to both. 13082 SmallPtrSet<const SDNode *, 32> Visited; 13083 SmallVector<const SDNode *, 16> Worklist; 13084 Visited.insert(Addr.getNode()); 13085 Worklist.push_back(N); 13086 Worklist.push_back(User); 13087 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 13088 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 13089 continue; 13090 13091 // Find the new opcode for the updating load/store. 13092 bool isLoadOp = true; 13093 bool isLaneOp = false; 13094 unsigned NewOpc = 0; 13095 unsigned NumVecs = 0; 13096 if (isIntrinsic) { 13097 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 13098 switch (IntNo) { 13099 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 13100 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 13101 NumVecs = 1; break; 13102 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 13103 NumVecs = 2; break; 13104 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 13105 NumVecs = 3; break; 13106 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 13107 NumVecs = 4; break; 13108 case Intrinsic::arm_neon_vld2dup: 13109 case Intrinsic::arm_neon_vld3dup: 13110 case Intrinsic::arm_neon_vld4dup: 13111 // TODO: Support updating VLDxDUP nodes. For now, we just skip 13112 // combining base updates for such intrinsics. 
13113 continue; 13114 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 13115 NumVecs = 2; isLaneOp = true; break; 13116 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 13117 NumVecs = 3; isLaneOp = true; break; 13118 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 13119 NumVecs = 4; isLaneOp = true; break; 13120 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 13121 NumVecs = 1; isLoadOp = false; break; 13122 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 13123 NumVecs = 2; isLoadOp = false; break; 13124 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 13125 NumVecs = 3; isLoadOp = false; break; 13126 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 13127 NumVecs = 4; isLoadOp = false; break; 13128 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 13129 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 13130 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 13131 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 13132 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 13133 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 13134 } 13135 } else { 13136 isLaneOp = true; 13137 switch (N->getOpcode()) { 13138 default: llvm_unreachable("unexpected opcode for Neon base update"); 13139 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 13140 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 13141 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 13142 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 13143 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 13144 NumVecs = 1; isLaneOp = false; break; 13145 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 13146 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 13147 } 13148 } 13149 13150 // Find the size of memory referenced by the load/store. 13151 EVT VecTy; 13152 if (isLoadOp) { 13153 VecTy = N->getValueType(0); 13154 } else if (isIntrinsic) { 13155 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 13156 } else { 13157 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 13158 VecTy = N->getOperand(1).getValueType(); 13159 } 13160 13161 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 13162 if (isLaneOp) 13163 NumBytes /= VecTy.getVectorNumElements(); 13164 13165 // If the increment is a constant, it must match the memory ref size. 13166 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 13167 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 13168 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 13169 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 13170 // separate instructions that make it harder to use a non-constant update. 13171 continue; 13172 } 13173 13174 // OK, we found an ADD we can fold into the base update. 13175 // Now, create a _UPD node, taking care of not breaking alignment. 13176 13177 EVT AlignedVecTy = VecTy; 13178 unsigned Alignment = MemN->getAlignment(); 13179 13180 // If this is a less-than-standard-aligned load/store, change the type to 13181 // match the standard alignment. 13182 // The alignment is overlooked when selecting _UPD variants; and it's 13183 // easier to introduce bitcasts here than fix that. 
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
      cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
13348 if (ResNo == NumVecs) 13349 continue; 13350 SDNode *User = *UI; 13351 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 13352 } 13353 13354 // Now the vldN-lane intrinsic is dead except for its chain result. 13355 // Update uses of the chain. 13356 std::vector<SDValue> VLDDupResults; 13357 for (unsigned n = 0; n < NumVecs; ++n) 13358 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 13359 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 13360 DCI.CombineTo(VLD, VLDDupResults); 13361 13362 return true; 13363 } 13364 13365 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 13366 /// ARMISD::VDUPLANE. 13367 static SDValue PerformVDUPLANECombine(SDNode *N, 13368 TargetLowering::DAGCombinerInfo &DCI) { 13369 SDValue Op = N->getOperand(0); 13370 13371 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 13372 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 13373 if (CombineVLDDUP(N, DCI)) 13374 return SDValue(N, 0); 13375 13376 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 13377 // redundant. Ignore bit_converts for now; element sizes are checked below. 13378 while (Op.getOpcode() == ISD::BITCAST) 13379 Op = Op.getOperand(0); 13380 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 13381 return SDValue(); 13382 13383 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 13384 unsigned EltSize = Op.getScalarValueSizeInBits(); 13385 // The canonical VMOV for a zero vector uses a 32-bit element size. 13386 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13387 unsigned EltBits; 13388 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 13389 EltSize = 8; 13390 EVT VT = N->getValueType(0); 13391 if (EltSize > VT.getScalarSizeInBits()) 13392 return SDValue(); 13393 13394 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 13395 } 13396 13397 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 13398 static SDValue PerformVDUPCombine(SDNode *N, 13399 TargetLowering::DAGCombinerInfo &DCI, 13400 const ARMSubtarget *Subtarget) { 13401 SelectionDAG &DAG = DCI.DAG; 13402 SDValue Op = N->getOperand(0); 13403 13404 if (!Subtarget->hasNEON()) 13405 return SDValue(); 13406 13407 // Match VDUP(LOAD) -> VLD1DUP. 13408 // We match this pattern here rather than waiting for isel because the 13409 // transform is only legal for unindexed loads. 13410 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 13411 if (LD && Op.hasOneUse() && LD->isUnindexed() && 13412 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 13413 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 13414 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 13415 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 13416 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 13417 Ops, LD->getMemoryVT(), 13418 LD->getMemOperand()); 13419 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 13420 return VLDDup; 13421 } 13422 13423 return SDValue(); 13424 } 13425 13426 static SDValue PerformLOADCombine(SDNode *N, 13427 TargetLowering::DAGCombinerInfo &DCI) { 13428 EVT VT = N->getValueType(0); 13429 13430 // If this is a legal vector load, try to combine it into a VLD1_UPD. 
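  // Illustrative example (added summary, not exhaustive): a load such as
  //   t1: v4i32,ch = load t0, %ptr
  //   t2: i32 = add %ptr, 16
  // where the add advances the pointer by the access size and is independent
  // of the load, can be folded by CombineBaseUpdate into a single VLD1_UPD,
  // which selects to a post-incremented "vld1 {...}, [rN]!" style instruction.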
13431 if (ISD::isNormalLoad(N) && VT.isVector() && 13432 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13433 return CombineBaseUpdate(N, DCI); 13434 13435 return SDValue(); 13436 } 13437 13438 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 13439 // pack all of the elements in one place. Next, store to memory in fewer 13440 // chunks. 13441 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 13442 SelectionDAG &DAG) { 13443 SDValue StVal = St->getValue(); 13444 EVT VT = StVal.getValueType(); 13445 if (!St->isTruncatingStore() || !VT.isVector()) 13446 return SDValue(); 13447 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13448 EVT StVT = St->getMemoryVT(); 13449 unsigned NumElems = VT.getVectorNumElements(); 13450 assert(StVT != VT && "Cannot truncate to the same type"); 13451 unsigned FromEltSz = VT.getScalarSizeInBits(); 13452 unsigned ToEltSz = StVT.getScalarSizeInBits(); 13453 13454 // From, To sizes and ElemCount must be pow of two 13455 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 13456 return SDValue(); 13457 13458 // We are going to use the original vector elt for storing. 13459 // Accumulated smaller vector elements must be a multiple of the store size. 13460 if (0 != (NumElems * FromEltSz) % ToEltSz) 13461 return SDValue(); 13462 13463 unsigned SizeRatio = FromEltSz / ToEltSz; 13464 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 13465 13466 // Create a type on which we perform the shuffle. 13467 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 13468 NumElems * SizeRatio); 13469 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 13470 13471 SDLoc DL(St); 13472 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 13473 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 13474 for (unsigned i = 0; i < NumElems; ++i) 13475 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 13476 : i * SizeRatio; 13477 13478 // Can't shuffle using an illegal type. 13479 if (!TLI.isTypeLegal(WideVecVT)) 13480 return SDValue(); 13481 13482 SDValue Shuff = DAG.getVectorShuffle( 13483 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 13484 // At this point all of the data is stored at the bottom of the 13485 // register. We now need to save it to mem. 13486 13487 // Find the largest store unit 13488 MVT StoreType = MVT::i8; 13489 for (MVT Tp : MVT::integer_valuetypes()) { 13490 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 13491 StoreType = Tp; 13492 } 13493 // Didn't find a legal store type. 13494 if (!TLI.isTypeLegal(StoreType)) 13495 return SDValue(); 13496 13497 // Bitcast the original vector into a vector of store-size units 13498 EVT StoreVecVT = 13499 EVT::getVectorVT(*DAG.getContext(), StoreType, 13500 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 13501 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 13502 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 13503 SmallVector<SDValue, 8> Chains; 13504 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 13505 TLI.getPointerTy(DAG.getDataLayout())); 13506 SDValue BasePtr = St->getBasePtr(); 13507 13508 // Perform one or more big stores into memory. 
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from a truncate (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::TRUNCATE)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  unsigned NumElements = 0;
  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
    NumElements = 4;
  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
    NumElements = 8;
  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  unsigned Alignment = St->getOriginalAlignment();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));
    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment, MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
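/// Roughly (summary added for illustration; see the code below for the exact
/// conditions): this tries, in order, NEON shuffling of truncating stores into
/// fewer full-width stores, MVE splitting of a store of a truncate into
/// narrowing stores, splitting a store of a VMOVDRR into two i32 stores,
/// bitcasting an i64 extract_vector_elt store to f64, and finally folding a
/// base-pointer increment into a VST1_UPD via CombineBaseUpdate.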
13579 static SDValue PerformSTORECombine(SDNode *N, 13580 TargetLowering::DAGCombinerInfo &DCI, 13581 const ARMSubtarget *Subtarget) { 13582 StoreSDNode *St = cast<StoreSDNode>(N); 13583 if (St->isVolatile()) 13584 return SDValue(); 13585 SDValue StVal = St->getValue(); 13586 EVT VT = StVal.getValueType(); 13587 13588 if (Subtarget->hasNEON()) 13589 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 13590 return Store; 13591 13592 if (Subtarget->hasMVEIntegerOps()) 13593 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 13594 return NewToken; 13595 13596 if (!ISD::isNormalStore(St)) 13597 return SDValue(); 13598 13599 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 13600 // ARM stores of arguments in the same cache line. 13601 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 13602 StVal.getNode()->hasOneUse()) { 13603 SelectionDAG &DAG = DCI.DAG; 13604 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 13605 SDLoc DL(St); 13606 SDValue BasePtr = St->getBasePtr(); 13607 SDValue NewST1 = DAG.getStore( 13608 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 13609 BasePtr, St->getPointerInfo(), St->getAlignment(), 13610 St->getMemOperand()->getFlags()); 13611 13612 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 13613 DAG.getConstant(4, DL, MVT::i32)); 13614 return DAG.getStore(NewST1.getValue(0), DL, 13615 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 13616 OffsetPtr, St->getPointerInfo(), 13617 std::min(4U, St->getAlignment() / 2), 13618 St->getMemOperand()->getFlags()); 13619 } 13620 13621 if (StVal.getValueType() == MVT::i64 && 13622 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 13623 13624 // Bitcast an i64 store extracted from a vector to f64. 13625 // Otherwise, the i64 value will be legalized to a pair of i32 values. 13626 SelectionDAG &DAG = DCI.DAG; 13627 SDLoc dl(StVal); 13628 SDValue IntVec = StVal.getOperand(0); 13629 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13630 IntVec.getValueType().getVectorNumElements()); 13631 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 13632 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 13633 Vec, StVal.getOperand(1)); 13634 dl = SDLoc(N); 13635 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 13636 // Make the DAGCombiner fold the bitcasts. 13637 DCI.AddToWorklist(Vec.getNode()); 13638 DCI.AddToWorklist(ExtElt.getNode()); 13639 DCI.AddToWorklist(V.getNode()); 13640 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 13641 St->getPointerInfo(), St->getAlignment(), 13642 St->getMemOperand()->getFlags(), St->getAAInfo()); 13643 } 13644 13645 // If this is a legal vector store, try to combine it into a VST1_UPD. 13646 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 13647 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13648 return CombineBaseUpdate(N, DCI); 13649 13650 return SDValue(); 13651 } 13652 13653 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 13654 /// can replace combinations of VMUL and VCVT (floating-point to integer) 13655 /// when the VMUL has a constant operand that is a power of 2. 
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
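  // Illustrative example (added, not exhaustive): a shift intrinsic whose
  // per-lane shift amount is a constant splat, e.g.
  //   arm.neon.vshifts(v8i16 X, build_vector <2, 2, ..., 2>)
  // is rewritten below to the immediate form VSHLIMM X, #2, so selection can
  // use an immediate vector shift rather than a register-shift variant.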
13782 case Intrinsic::arm_neon_vshifts: 13783 case Intrinsic::arm_neon_vshiftu: 13784 case Intrinsic::arm_neon_vrshifts: 13785 case Intrinsic::arm_neon_vrshiftu: 13786 case Intrinsic::arm_neon_vrshiftn: 13787 case Intrinsic::arm_neon_vqshifts: 13788 case Intrinsic::arm_neon_vqshiftu: 13789 case Intrinsic::arm_neon_vqshiftsu: 13790 case Intrinsic::arm_neon_vqshiftns: 13791 case Intrinsic::arm_neon_vqshiftnu: 13792 case Intrinsic::arm_neon_vqshiftnsu: 13793 case Intrinsic::arm_neon_vqrshiftns: 13794 case Intrinsic::arm_neon_vqrshiftnu: 13795 case Intrinsic::arm_neon_vqrshiftnsu: { 13796 EVT VT = N->getOperand(1).getValueType(); 13797 int64_t Cnt; 13798 unsigned VShiftOpc = 0; 13799 13800 switch (IntNo) { 13801 case Intrinsic::arm_neon_vshifts: 13802 case Intrinsic::arm_neon_vshiftu: 13803 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 13804 VShiftOpc = ARMISD::VSHLIMM; 13805 break; 13806 } 13807 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 13808 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 13809 : ARMISD::VSHRuIMM); 13810 break; 13811 } 13812 return SDValue(); 13813 13814 case Intrinsic::arm_neon_vrshifts: 13815 case Intrinsic::arm_neon_vrshiftu: 13816 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 13817 break; 13818 return SDValue(); 13819 13820 case Intrinsic::arm_neon_vqshifts: 13821 case Intrinsic::arm_neon_vqshiftu: 13822 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13823 break; 13824 return SDValue(); 13825 13826 case Intrinsic::arm_neon_vqshiftsu: 13827 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13828 break; 13829 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 13830 13831 case Intrinsic::arm_neon_vrshiftn: 13832 case Intrinsic::arm_neon_vqshiftns: 13833 case Intrinsic::arm_neon_vqshiftnu: 13834 case Intrinsic::arm_neon_vqshiftnsu: 13835 case Intrinsic::arm_neon_vqrshiftns: 13836 case Intrinsic::arm_neon_vqrshiftnu: 13837 case Intrinsic::arm_neon_vqrshiftnsu: 13838 // Narrowing shifts require an immediate right shift. 13839 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 13840 break; 13841 llvm_unreachable("invalid shift count for narrowing vector shift " 13842 "intrinsic"); 13843 13844 default: 13845 llvm_unreachable("unhandled vector shift"); 13846 } 13847 13848 switch (IntNo) { 13849 case Intrinsic::arm_neon_vshifts: 13850 case Intrinsic::arm_neon_vshiftu: 13851 // Opcode already set above. 
13852 break; 13853 case Intrinsic::arm_neon_vrshifts: 13854 VShiftOpc = ARMISD::VRSHRsIMM; 13855 break; 13856 case Intrinsic::arm_neon_vrshiftu: 13857 VShiftOpc = ARMISD::VRSHRuIMM; 13858 break; 13859 case Intrinsic::arm_neon_vrshiftn: 13860 VShiftOpc = ARMISD::VRSHRNIMM; 13861 break; 13862 case Intrinsic::arm_neon_vqshifts: 13863 VShiftOpc = ARMISD::VQSHLsIMM; 13864 break; 13865 case Intrinsic::arm_neon_vqshiftu: 13866 VShiftOpc = ARMISD::VQSHLuIMM; 13867 break; 13868 case Intrinsic::arm_neon_vqshiftsu: 13869 VShiftOpc = ARMISD::VQSHLsuIMM; 13870 break; 13871 case Intrinsic::arm_neon_vqshiftns: 13872 VShiftOpc = ARMISD::VQSHRNsIMM; 13873 break; 13874 case Intrinsic::arm_neon_vqshiftnu: 13875 VShiftOpc = ARMISD::VQSHRNuIMM; 13876 break; 13877 case Intrinsic::arm_neon_vqshiftnsu: 13878 VShiftOpc = ARMISD::VQSHRNsuIMM; 13879 break; 13880 case Intrinsic::arm_neon_vqrshiftns: 13881 VShiftOpc = ARMISD::VQRSHRNsIMM; 13882 break; 13883 case Intrinsic::arm_neon_vqrshiftnu: 13884 VShiftOpc = ARMISD::VQRSHRNuIMM; 13885 break; 13886 case Intrinsic::arm_neon_vqrshiftnsu: 13887 VShiftOpc = ARMISD::VQRSHRNsuIMM; 13888 break; 13889 } 13890 13891 SDLoc dl(N); 13892 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13893 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 13894 } 13895 13896 case Intrinsic::arm_neon_vshiftins: { 13897 EVT VT = N->getOperand(1).getValueType(); 13898 int64_t Cnt; 13899 unsigned VShiftOpc = 0; 13900 13901 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 13902 VShiftOpc = ARMISD::VSLIIMM; 13903 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 13904 VShiftOpc = ARMISD::VSRIIMM; 13905 else { 13906 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 13907 } 13908 13909 SDLoc dl(N); 13910 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13911 N->getOperand(1), N->getOperand(2), 13912 DAG.getConstant(Cnt, dl, MVT::i32)); 13913 } 13914 13915 case Intrinsic::arm_neon_vqrshifts: 13916 case Intrinsic::arm_neon_vqrshiftu: 13917 // No immediate versions of these to check for. 13918 break; 13919 } 13920 13921 return SDValue(); 13922 } 13923 13924 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 13925 /// lowers them. As with the vector shift intrinsics, this is done during DAG 13926 /// combining instead of DAG legalizing because the build_vectors for 64-bit 13927 /// vector element shift counts are generally not legal, and it is hard to see 13928 /// their values after they get legalized to loads from a constant pool. 13929 static SDValue PerformShiftCombine(SDNode *N, 13930 TargetLowering::DAGCombinerInfo &DCI, 13931 const ARMSubtarget *ST) { 13932 SelectionDAG &DAG = DCI.DAG; 13933 EVT VT = N->getValueType(0); 13934 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 13935 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 13936 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 
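    // Worked example (illustrative): with x = 0x0000B1B0,
    //   bswap(x)          = 0xB0B10000
    //   srl(bswap(x), 16) = 0x0000B0B1 == rotr(bswap(x), 16) == rev16(x)
    // so rewriting to the rotate form lets selection emit a single REV16.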
13937 SDValue N1 = N->getOperand(1); 13938 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 13939 SDValue N0 = N->getOperand(0); 13940 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 13941 DAG.MaskedValueIsZero(N0.getOperand(0), 13942 APInt::getHighBitsSet(32, 16))) 13943 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 13944 } 13945 } 13946 13947 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 13948 N->getOperand(0)->getOpcode() == ISD::AND && 13949 N->getOperand(0)->hasOneUse()) { 13950 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13951 return SDValue(); 13952 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 13953 // usually show up because instcombine prefers to canonicalize it to 13954 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 13955 // out of GEP lowering in some cases. 13956 SDValue N0 = N->getOperand(0); 13957 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13958 if (!ShiftAmtNode) 13959 return SDValue(); 13960 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 13961 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13962 if (!AndMaskNode) 13963 return SDValue(); 13964 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 13965 // Don't transform uxtb/uxth. 13966 if (AndMask == 255 || AndMask == 65535) 13967 return SDValue(); 13968 if (isMask_32(AndMask)) { 13969 uint32_t MaskedBits = countLeadingZeros(AndMask); 13970 if (MaskedBits > ShiftAmt) { 13971 SDLoc DL(N); 13972 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13973 DAG.getConstant(MaskedBits, DL, MVT::i32)); 13974 return DAG.getNode( 13975 ISD::SRL, DL, MVT::i32, SHL, 13976 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 13977 } 13978 } 13979 } 13980 13981 // Nothing to be done for scalar shifts. 13982 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13983 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 13984 return SDValue(); 13985 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 13986 return SDValue(); 13987 13988 int64_t Cnt; 13989 13990 switch (N->getOpcode()) { 13991 default: llvm_unreachable("unexpected shift opcode"); 13992 13993 case ISD::SHL: 13994 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 13995 SDLoc dl(N); 13996 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 13997 DAG.getConstant(Cnt, dl, MVT::i32)); 13998 } 13999 break; 14000 14001 case ISD::SRA: 14002 case ISD::SRL: 14003 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 14004 unsigned VShiftOpc = 14005 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 14006 SDLoc dl(N); 14007 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 14008 DAG.getConstant(Cnt, dl, MVT::i32)); 14009 } 14010 } 14011 return SDValue(); 14012 } 14013 14014 // Look for a sign/zero extend of a larger than legal load. This can be split 14015 // into two extending loads, which are simpler to deal with than an arbitrary 14016 // sign extend. 
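// For example (illustrative), on MVE:
//   (v8i32 (sext (v8i16 load [ptr])))
// can be split into
//   (v8i32 (concat_vectors (v4i32 (sextload v4i16 [ptr])),
//                          (v4i32 (sextload v4i16 [ptr + 8]))))
// where each half is legal, or easily legalized, on its own.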
14017 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 14018 SDValue N0 = N->getOperand(0); 14019 if (N0.getOpcode() != ISD::LOAD) 14020 return SDValue(); 14021 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 14022 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 14023 LD->getExtensionType() != ISD::NON_EXTLOAD) 14024 return SDValue(); 14025 EVT FromVT = LD->getValueType(0); 14026 EVT ToVT = N->getValueType(0); 14027 if (!ToVT.isVector()) 14028 return SDValue(); 14029 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 14030 EVT ToEltVT = ToVT.getVectorElementType(); 14031 EVT FromEltVT = FromVT.getVectorElementType(); 14032 14033 unsigned NumElements = 0; 14034 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 14035 NumElements = 4; 14036 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 14037 NumElements = 8; 14038 if (NumElements == 0 || 14039 FromVT.getVectorNumElements() == NumElements || 14040 FromVT.getVectorNumElements() % NumElements != 0 || 14041 !isPowerOf2_32(NumElements)) 14042 return SDValue(); 14043 14044 SDLoc DL(LD); 14045 // Details about the old load 14046 SDValue Ch = LD->getChain(); 14047 SDValue BasePtr = LD->getBasePtr(); 14048 unsigned Alignment = LD->getOriginalAlignment(); 14049 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 14050 AAMDNodes AAInfo = LD->getAAInfo(); 14051 14052 ISD::LoadExtType NewExtType = 14053 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 14054 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 14055 EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14056 EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14057 unsigned NewOffset = NewFromVT.getSizeInBits() / 8; 14058 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 14059 14060 // Split the load in half, each side of which is extended separately. This 14061 // is good enough, as legalisation will take it from there. They are either 14062 // already legal or they will be split further into something that is 14063 // legal. 14064 SDValue NewLoad1 = 14065 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, 14066 LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); 14067 SDValue NewLoad2 = 14068 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 14069 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 14070 Alignment, MMOFlags, AAInfo); 14071 14072 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 14073 SDValue(NewLoad1.getNode(), 1), 14074 SDValue(NewLoad2.getNode(), 1)); 14075 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 14076 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); 14077 } 14078 14079 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 14080 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 14081 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 14082 const ARMSubtarget *ST) { 14083 SDValue N0 = N->getOperand(0); 14084 14085 // Check for sign- and zero-extensions of vector extract operations of 8- and 14086 // 16-bit vector elements. NEON and MVE support these directly. They are 14087 // handled during DAG combining because type legalization will promote them 14088 // to 32-bit types and it is messy to recognize the operations after that. 
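  // For example (illustrative):
  //   (i32 (sign_extend (i16 (extract_vector_elt v8i16 V, lane))))
  // becomes (VGETLANEs V, lane), which selects to a single signed lane move
  // (a "vmov.s16 rN, Dm[lane]" style instruction) instead of an extract
  // followed by a separate sign-extension.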
14089 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 14090 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 14091 SDValue Vec = N0.getOperand(0); 14092 SDValue Lane = N0.getOperand(1); 14093 EVT VT = N->getValueType(0); 14094 EVT EltVT = N0.getValueType(); 14095 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14096 14097 if (VT == MVT::i32 && 14098 (EltVT == MVT::i8 || EltVT == MVT::i16) && 14099 TLI.isTypeLegal(Vec.getValueType()) && 14100 isa<ConstantSDNode>(Lane)) { 14101 14102 unsigned Opc = 0; 14103 switch (N->getOpcode()) { 14104 default: llvm_unreachable("unexpected opcode"); 14105 case ISD::SIGN_EXTEND: 14106 Opc = ARMISD::VGETLANEs; 14107 break; 14108 case ISD::ZERO_EXTEND: 14109 case ISD::ANY_EXTEND: 14110 Opc = ARMISD::VGETLANEu; 14111 break; 14112 } 14113 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 14114 } 14115 } 14116 14117 if (ST->hasMVEIntegerOps()) 14118 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 14119 return NewLoad; 14120 14121 return SDValue(); 14122 } 14123 14124 static const APInt *isPowerOf2Constant(SDValue V) { 14125 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 14126 if (!C) 14127 return nullptr; 14128 const APInt *CV = &C->getAPIntValue(); 14129 return CV->isPowerOf2() ? CV : nullptr; 14130 } 14131 14132 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 14133 // If we have a CMOV, OR and AND combination such as: 14134 // if (x & CN) 14135 // y |= CM; 14136 // 14137 // And: 14138 // * CN is a single bit; 14139 // * All bits covered by CM are known zero in y 14140 // 14141 // Then we can convert this into a sequence of BFI instructions. This will 14142 // always be a win if CM is a single bit, will always be no worse than the 14143 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 14144 // three bits (due to the extra IT instruction). 14145 14146 SDValue Op0 = CMOV->getOperand(0); 14147 SDValue Op1 = CMOV->getOperand(1); 14148 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 14149 auto CC = CCNode->getAPIntValue().getLimitedValue(); 14150 SDValue CmpZ = CMOV->getOperand(4); 14151 14152 // The compare must be against zero. 14153 if (!isNullConstant(CmpZ->getOperand(1))) 14154 return SDValue(); 14155 14156 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 14157 SDValue And = CmpZ->getOperand(0); 14158 if (And->getOpcode() != ISD::AND) 14159 return SDValue(); 14160 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 14161 if (!AndC) 14162 return SDValue(); 14163 SDValue X = And->getOperand(0); 14164 14165 if (CC == ARMCC::EQ) { 14166 // We're performing an "equal to zero" compare. Swap the operands so we 14167 // canonicalize on a "not equal to zero" compare. 14168 std::swap(Op0, Op1); 14169 } else { 14170 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 14171 } 14172 14173 if (Op1->getOpcode() != ISD::OR) 14174 return SDValue(); 14175 14176 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 14177 if (!OrC) 14178 return SDValue(); 14179 SDValue Y = Op1->getOperand(0); 14180 14181 if (Op0 != Y) 14182 return SDValue(); 14183 14184 // Now, is it profitable to continue? 14185 APInt OrCI = OrC->getAPIntValue(); 14186 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 14187 if (OrCI.countPopulation() > Heuristic) 14188 return SDValue(); 14189 14190 // Lastly, can we determine that the bits defined by OrCI 14191 // are zero in Y? 
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isNullValue())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_set_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the branch that follows should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the branch that follows should not branch back to the beginning
  //   of the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
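  // Roughly (illustrative), for the entry check this turns
  //   brcond (setcc (int_test_set_loop_iterations N), 0, eq), %exit
  // into (ARMISD::WLS chain, N, %exit), and for the latch it turns
  //   brcond (setcc (int_loop_decrement_reg C, Step), 0, ne), %header
  // into an ARMISD::LOOP_DEC feeding an ARMISD::LE branch back to %header.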
14279 14280 ISD::CondCode CC; 14281 SDValue Cond; 14282 int Imm = 1; 14283 bool Negate = false; 14284 SDValue Chain = N->getOperand(0); 14285 SDValue Dest; 14286 14287 if (N->getOpcode() == ISD::BRCOND) { 14288 CC = ISD::SETEQ; 14289 Cond = N->getOperand(1); 14290 Dest = N->getOperand(2); 14291 } else { 14292 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 14293 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 14294 Cond = N->getOperand(2); 14295 Dest = N->getOperand(4); 14296 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 14297 if (!Const->isOne() && !Const->isNullValue()) 14298 return SDValue(); 14299 Imm = Const->getZExtValue(); 14300 } else 14301 return SDValue(); 14302 } 14303 14304 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 14305 if (!Int) 14306 return SDValue(); 14307 14308 if (Negate) 14309 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 14310 14311 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 14312 return (CC == ISD::SETEQ && Imm == 0) || 14313 (CC == ISD::SETNE && Imm == 1) || 14314 (CC == ISD::SETLT && Imm == 1) || 14315 (CC == ISD::SETULT && Imm == 1); 14316 }; 14317 14318 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 14319 return (CC == ISD::SETEQ && Imm == 1) || 14320 (CC == ISD::SETNE && Imm == 0) || 14321 (CC == ISD::SETGT && Imm == 0) || 14322 (CC == ISD::SETUGT && Imm == 0) || 14323 (CC == ISD::SETGE && Imm == 1) || 14324 (CC == ISD::SETUGE && Imm == 1); 14325 }; 14326 14327 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 14328 "unsupported condition"); 14329 14330 SDLoc dl(Int); 14331 SelectionDAG &DAG = DCI.DAG; 14332 SDValue Elements = Int.getOperand(2); 14333 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 14334 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 14335 && "expected single br user"); 14336 SDNode *Br = *N->use_begin(); 14337 SDValue OtherTarget = Br->getOperand(1); 14338 14339 // Update the unconditional branch to branch to the given Dest. 14340 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 14341 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 14342 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 14343 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 14344 }; 14345 14346 if (IntOp == Intrinsic::test_set_loop_iterations) { 14347 SDValue Res; 14348 // We expect this 'instruction' to branch when the counter is zero. 14349 if (IsTrueIfZero(CC, Imm)) { 14350 SDValue Ops[] = { Chain, Elements, Dest }; 14351 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14352 } else { 14353 // The logic is the reverse of what we need for WLS, so find the other 14354 // basic block target: the target of the proceeding br. 14355 UpdateUncondBr(Br, Dest, DAG); 14356 14357 SDValue Ops[] = { Chain, Elements, OtherTarget }; 14358 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14359 } 14360 DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); 14361 return Res; 14362 } else { 14363 SDValue Size = DAG.getTargetConstant( 14364 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 14365 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 14366 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 14367 DAG.getVTList(MVT::i32, MVT::Other), Args); 14368 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 14369 14370 // We expect this instruction to branch when the count is not zero. 14371 SDValue Target = IsFalseIfZero(CC, Imm) ? 
Dest : OtherTarget; 14372 14373 // Update the unconditional branch to target the loop preheader if we've 14374 // found the condition has been reversed. 14375 if (Target == OtherTarget) 14376 UpdateUncondBr(Br, Dest, DAG); 14377 14378 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 14379 SDValue(LoopDec.getNode(), 1), Chain); 14380 14381 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 14382 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 14383 } 14384 return SDValue(); 14385 } 14386 14387 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 14388 SDValue 14389 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 14390 SDValue Cmp = N->getOperand(4); 14391 if (Cmp.getOpcode() != ARMISD::CMPZ) 14392 // Only looking at NE cases. 14393 return SDValue(); 14394 14395 EVT VT = N->getValueType(0); 14396 SDLoc dl(N); 14397 SDValue LHS = Cmp.getOperand(0); 14398 SDValue RHS = Cmp.getOperand(1); 14399 SDValue Chain = N->getOperand(0); 14400 SDValue BB = N->getOperand(1); 14401 SDValue ARMcc = N->getOperand(2); 14402 ARMCC::CondCodes CC = 14403 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14404 14405 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 14406 // -> (brcond Chain BB CC CPSR Cmp) 14407 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 14408 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 14409 LHS->getOperand(0)->hasOneUse()) { 14410 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 14411 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 14412 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14413 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14414 if ((LHS00C && LHS00C->getZExtValue() == 0) && 14415 (LHS01C && LHS01C->getZExtValue() == 1) && 14416 (LHS1C && LHS1C->getZExtValue() == 1) && 14417 (RHSC && RHSC->getZExtValue() == 0)) { 14418 return DAG.getNode( 14419 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 14420 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 14421 } 14422 } 14423 14424 return SDValue(); 14425 } 14426 14427 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 14428 SDValue 14429 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 14430 SDValue Cmp = N->getOperand(4); 14431 if (Cmp.getOpcode() != ARMISD::CMPZ) 14432 // Only looking at EQ and NE cases. 14433 return SDValue(); 14434 14435 EVT VT = N->getValueType(0); 14436 SDLoc dl(N); 14437 SDValue LHS = Cmp.getOperand(0); 14438 SDValue RHS = Cmp.getOperand(1); 14439 SDValue FalseVal = N->getOperand(0); 14440 SDValue TrueVal = N->getOperand(1); 14441 SDValue ARMcc = N->getOperand(2); 14442 ARMCC::CondCodes CC = 14443 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14444 14445 // BFI is only available on V6T2+. 14446 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 14447 SDValue R = PerformCMOVToBFICombine(N, DAG); 14448 if (R) 14449 return R; 14450 } 14451 14452 // Simplify 14453 // mov r1, r0 14454 // cmp r1, x 14455 // mov r0, y 14456 // moveq r0, x 14457 // to 14458 // cmp r0, x 14459 // movne r0, y 14460 // 14461 // mov r1, r0 14462 // cmp r1, x 14463 // mov r0, x 14464 // movne r0, y 14465 // to 14466 // cmp r0, x 14467 // movne r0, y 14468 /// FIXME: Turn this into a target neutral optimization? 
14469 SDValue Res; 14470 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 14471 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 14472 N->getOperand(3), Cmp); 14473 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 14474 SDValue ARMcc; 14475 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 14476 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 14477 N->getOperand(3), NewCmp); 14478 } 14479 14480 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 14481 // -> (cmov F T CC CPSR Cmp) 14482 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 14483 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 14484 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14485 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14486 if ((LHS0C && LHS0C->getZExtValue() == 0) && 14487 (LHS1C && LHS1C->getZExtValue() == 1) && 14488 (RHSC && RHSC->getZExtValue() == 0)) { 14489 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 14490 LHS->getOperand(2), LHS->getOperand(3), 14491 LHS->getOperand(4)); 14492 } 14493 } 14494 14495 if (!VT.isInteger()) 14496 return SDValue(); 14497 14498 // Materialize a boolean comparison for integers so we can avoid branching. 14499 if (isNullConstant(FalseVal)) { 14500 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 14501 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 14502 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 14503 // right 5 bits will make that 32 be 1, otherwise it will be 0. 14504 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 14505 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14506 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 14507 DAG.getConstant(5, dl, MVT::i32)); 14508 } else { 14509 // CMOV 0, 1, ==, (CMPZ x, y) -> 14510 // (ADDCARRY (SUB x, y), t:0, t:1) 14511 // where t = (SUBCARRY 0, (SUB x, y), 0) 14512 // 14513 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 14514 // x != y. In other words, a carry C == 1 when x == y, C == 0 14515 // otherwise. 14516 // The final ADDCARRY computes 14517 // x - y + (0 - (x - y)) + C == C 14518 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14519 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14520 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 14521 // ISD::SUBCARRY returns a borrow but we want the carry here 14522 // actually. 14523 SDValue Carry = 14524 DAG.getNode(ISD::SUB, dl, MVT::i32, 14525 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 14526 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 14527 } 14528 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 14529 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 14530 // This seems pointless but will allow us to combine it further below. 
14531 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14532 SDValue Sub = 14533 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14534 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14535 Sub.getValue(1), SDValue()); 14536 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 14537 N->getOperand(3), CPSRGlue.getValue(1)); 14538 FalseVal = Sub; 14539 } 14540 } else if (isNullConstant(TrueVal)) { 14541 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 14542 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 14543 // This seems pointless but will allow us to combine it further below 14544 // Note that we change == for != as this is the dual for the case above. 14545 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14546 SDValue Sub = 14547 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14548 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14549 Sub.getValue(1), SDValue()); 14550 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 14551 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 14552 N->getOperand(3), CPSRGlue.getValue(1)); 14553 FalseVal = Sub; 14554 } 14555 } 14556 14557 // On Thumb1, the DAG above may be further combined if z is a power of 2 14558 // (z == 2 ^ K). 14559 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 14560 // t1 = (USUBO (SUB x, y), 1) 14561 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 14562 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14563 // 14564 // This also handles the special case of comparing against zero; it's 14565 // essentially, the same pattern, except there's no SUBS: 14566 // CMOV x, z, !=, (CMPZ x, 0) -> 14567 // t1 = (USUBO x, 1) 14568 // t2 = (SUBCARRY x, t1:0, t1:1) 14569 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14570 const APInt *TrueConst; 14571 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 14572 ((FalseVal.getOpcode() == ARMISD::SUBS && 14573 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 14574 (FalseVal == LHS && isNullConstant(RHS))) && 14575 (TrueConst = isPowerOf2Constant(TrueVal))) { 14576 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14577 unsigned ShiftAmount = TrueConst->logBase2(); 14578 if (ShiftAmount) 14579 TrueVal = DAG.getConstant(1, dl, VT); 14580 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 14581 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 14582 14583 if (ShiftAmount) 14584 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 14585 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 14586 } 14587 14588 if (Res.getNode()) { 14589 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 14590 // Capture demanded bits information that would be otherwise lost. 
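    // For example (illustrative): if the combined value is known to be 0 or 1
    // (Known.Zero == 0xfffffffe), wrapping it in AssertZext(i1) below lets
    // later combines treat a subsequent "and Res, #1" as redundant.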
14591 if (Known.Zero == 0xfffffffe) 14592 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14593 DAG.getValueType(MVT::i1)); 14594 else if (Known.Zero == 0xffffff00) 14595 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14596 DAG.getValueType(MVT::i8)); 14597 else if (Known.Zero == 0xffff0000) 14598 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14599 DAG.getValueType(MVT::i16)); 14600 } 14601 14602 return Res; 14603 } 14604 14605 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 14606 DAGCombinerInfo &DCI) const { 14607 switch (N->getOpcode()) { 14608 default: break; 14609 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 14610 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 14611 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 14612 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 14613 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 14614 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 14615 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 14616 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 14617 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 14618 case ISD::BRCOND: 14619 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 14620 case ARMISD::ADDC: 14621 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 14622 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 14623 case ARMISD::BFI: return PerformBFICombine(N, DCI); 14624 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 14625 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 14626 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 14627 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 14628 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 14629 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 14630 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 14631 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 14632 case ISD::FP_TO_SINT: 14633 case ISD::FP_TO_UINT: 14634 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 14635 case ISD::FDIV: 14636 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 14637 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 14638 case ISD::SHL: 14639 case ISD::SRA: 14640 case ISD::SRL: 14641 return PerformShiftCombine(N, DCI, Subtarget); 14642 case ISD::SIGN_EXTEND: 14643 case ISD::ZERO_EXTEND: 14644 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 14645 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 14646 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 14647 case ISD::LOAD: return PerformLOADCombine(N, DCI); 14648 case ARMISD::VLD1DUP: 14649 case ARMISD::VLD2DUP: 14650 case ARMISD::VLD3DUP: 14651 case ARMISD::VLD4DUP: 14652 return PerformVLDCombine(N, DCI); 14653 case ARMISD::BUILD_VECTOR: 14654 return PerformARMBUILD_VECTORCombine(N, DCI); 14655 case ARMISD::PREDICATE_CAST: 14656 return PerformPREDICATE_CASTCombine(N, DCI); 14657 case ARMISD::VCMP: 14658 return PerformVCMPCombine(N, DCI, Subtarget); 14659 case ARMISD::SMULWB: { 14660 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14661 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14662 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14663 return SDValue(); 14664 break; 14665 } 14666 case ARMISD::SMULWT: { 14667 
unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14668 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 14669 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14670 return SDValue(); 14671 break; 14672 } 14673 case ARMISD::SMLALBB: 14674 case ARMISD::QADD16b: 14675 case ARMISD::QSUB16b: { 14676 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14677 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14678 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14679 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14680 return SDValue(); 14681 break; 14682 } 14683 case ARMISD::SMLALBT: { 14684 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 14685 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14686 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 14687 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14688 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 14689 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 14690 return SDValue(); 14691 break; 14692 } 14693 case ARMISD::SMLALTB: { 14694 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 14695 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14696 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 14697 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14698 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 14699 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 14700 return SDValue(); 14701 break; 14702 } 14703 case ARMISD::SMLALTT: { 14704 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14705 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 14706 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14707 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14708 return SDValue(); 14709 break; 14710 } 14711 case ARMISD::QADD8b: 14712 case ARMISD::QSUB8b: { 14713 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14714 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 14715 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14716 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14717 return SDValue(); 14718 break; 14719 } 14720 case ISD::INTRINSIC_VOID: 14721 case ISD::INTRINSIC_W_CHAIN: 14722 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 14723 case Intrinsic::arm_neon_vld1: 14724 case Intrinsic::arm_neon_vld1x2: 14725 case Intrinsic::arm_neon_vld1x3: 14726 case Intrinsic::arm_neon_vld1x4: 14727 case Intrinsic::arm_neon_vld2: 14728 case Intrinsic::arm_neon_vld3: 14729 case Intrinsic::arm_neon_vld4: 14730 case Intrinsic::arm_neon_vld2lane: 14731 case Intrinsic::arm_neon_vld3lane: 14732 case Intrinsic::arm_neon_vld4lane: 14733 case Intrinsic::arm_neon_vld2dup: 14734 case Intrinsic::arm_neon_vld3dup: 14735 case Intrinsic::arm_neon_vld4dup: 14736 case Intrinsic::arm_neon_vst1: 14737 case Intrinsic::arm_neon_vst1x2: 14738 case Intrinsic::arm_neon_vst1x3: 14739 case Intrinsic::arm_neon_vst1x4: 14740 case Intrinsic::arm_neon_vst2: 14741 case Intrinsic::arm_neon_vst3: 14742 case Intrinsic::arm_neon_vst4: 14743 case Intrinsic::arm_neon_vst2lane: 14744 case Intrinsic::arm_neon_vst3lane: 14745 case Intrinsic::arm_neon_vst4lane: 14746 return PerformVLDCombine(N, DCI); 14747 default: break; 14748 } 14749 break; 14750 } 14751 return SDValue(); 14752 } 14753 14754 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 14755 EVT VT) const { 
14756 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 14757 } 14758 14759 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 14760 unsigned Alignment, 14761 MachineMemOperand::Flags, 14762 bool *Fast) const { 14763 // Depends on what it gets converted into if the type is weird. 14764 if (!VT.isSimple()) 14765 return false; 14766 14767 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs. 14768 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 14769 auto Ty = VT.getSimpleVT().SimpleTy; 14770 14771 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 14772 // Unaligned access can use (for example) LDRB, LDRH, LDR. 14773 if (AllowsUnaligned) { 14774 if (Fast) 14775 *Fast = Subtarget->hasV7Ops(); 14776 return true; 14777 } 14778 } 14779 14780 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 14781 // For any little-endian targets with NEON, we can support unaligned ld/st 14782 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 14783 // A big-endian target may also explicitly support unaligned accesses. 14784 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 14785 if (Fast) 14786 *Fast = true; 14787 return true; 14788 } 14789 } 14790 14791 if (!Subtarget->hasMVEIntegerOps()) 14792 return false; 14793 14794 // These are for predicates. 14795 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { 14796 if (Fast) 14797 *Fast = true; 14798 return true; 14799 } 14800 14801 // These are for truncated stores/narrowing loads. They are fine so long as 14802 // the alignment is at least the size of the item being loaded. 14803 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 14804 Alignment >= VT.getScalarSizeInBits() / 8) { 14805 if (Fast) 14806 *Fast = true; 14807 return true; 14808 } 14809 14810 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 14811 // VSTRW.U32 all store the vector register in exactly the same format, and 14812 // differ only in the range of their immediate offset field and the required 14813 // alignment. So there is always a store that can be used, regardless of 14814 // actual type. 14815 // 14816 // For big endian, that is not the case. But we can still emit a (VSTRB.U8; 14817 // VREV64.8) pair and get the same effect. This will likely be better than 14818 // aligning the vector through the stack. 14819 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 14820 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 14821 Ty == MVT::v2f64) { 14822 if (Fast) 14823 *Fast = true; 14824 return true; 14825 } 14826 14827 return false; 14828 } 14829 14830 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, 14831 unsigned AlignCheck) { 14832 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && 14833 (DstAlign == 0 || DstAlign % AlignCheck == 0)); 14834 } 14835 14836 EVT ARMTargetLowering::getOptimalMemOpType( 14837 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, 14838 bool ZeroMemset, bool MemcpyStrSrc, 14839 const AttributeList &FuncAttributes) const { 14840 // See if we can use NEON instructions for this...
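// Roughly: copies of at least 16 bytes that are 16-byte aligned (or where
// unaligned 128-bit accesses are fast) use v2f64, i.e. 128-bit NEON
// loads/stores; copies of at least 8 bytes similarly fall back to f64.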
14841 if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && 14842 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { 14843 bool Fast; 14844 if (Size >= 16 && 14845 (memOpAlign(SrcAlign, DstAlign, 16) || 14846 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, 14847 MachineMemOperand::MONone, &Fast) && 14848 Fast))) { 14849 return MVT::v2f64; 14850 } else if (Size >= 8 && 14851 (memOpAlign(SrcAlign, DstAlign, 8) || 14852 (allowsMisalignedMemoryAccesses( 14853 MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && 14854 Fast))) { 14855 return MVT::f64; 14856 } 14857 } 14858 14859 // Let the target-independent logic figure it out. 14860 return MVT::Other; 14861 } 14862 14863 // 64-bit integers are split into their high and low parts and held in two 14864 // different registers, so the trunc is free since the low register can just 14865 // be used. 14866 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 14867 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 14868 return false; 14869 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 14870 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 14871 return (SrcBits == 64 && DestBits == 32); 14872 } 14873 14874 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 14875 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 14876 !DstVT.isInteger()) 14877 return false; 14878 unsigned SrcBits = SrcVT.getSizeInBits(); 14879 unsigned DestBits = DstVT.getSizeInBits(); 14880 return (SrcBits == 64 && DestBits == 32); 14881 } 14882 14883 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 14884 if (Val.getOpcode() != ISD::LOAD) 14885 return false; 14886 14887 EVT VT1 = Val.getValueType(); 14888 if (!VT1.isSimple() || !VT1.isInteger() || 14889 !VT2.isSimple() || !VT2.isInteger()) 14890 return false; 14891 14892 switch (VT1.getSimpleVT().SimpleTy) { 14893 default: break; 14894 case MVT::i1: 14895 case MVT::i8: 14896 case MVT::i16: 14897 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 14898 return true; 14899 } 14900 14901 return false; 14902 } 14903 14904 bool ARMTargetLowering::isFNegFree(EVT VT) const { 14905 if (!VT.isSimple()) 14906 return false; 14907 14908 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 14909 // negate values directly (fneg is free). So, we don't want to let the DAG 14910 // combiner rewrite fneg into xors and some other instructions. For f16 and 14911 // FullFP16 argument passing, some bitcast nodes may be introduced, 14912 // triggering this DAG combine rewrite, so we are avoiding that with this. 14913 switch (VT.getSimpleVT().SimpleTy) { 14914 default: break; 14915 case MVT::f16: 14916 return Subtarget->hasFullFP16(); 14917 } 14918 14919 return false; 14920 } 14921 14922 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 14923 /// of the vector elements. 
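/// For example, a sext or zext from <8 x i8> to <8 x i16> qualifies. Pairs
/// of such extends feeding an add or sub can then be selected as a single
/// widening instruction rather than extending each operand separately.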
14924 static bool areExtractExts(Value *Ext1, Value *Ext2) { 14925 auto areExtDoubled = [](Instruction *Ext) { 14926 return Ext->getType()->getScalarSizeInBits() == 14927 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 14928 }; 14929 14930 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 14931 !match(Ext2, m_ZExtOrSExt(m_Value())) || 14932 !areExtDoubled(cast<Instruction>(Ext1)) || 14933 !areExtDoubled(cast<Instruction>(Ext2))) 14934 return false; 14935 14936 return true; 14937 } 14938 14939 /// Check if sinking \p I's operands to I's basic block is profitable, because 14940 /// the operands can be folded into a target instruction, e.g. 14941 /// sext/zext can be folded into vsubl. 14942 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 14943 SmallVectorImpl<Use *> &Ops) const { 14944 if (!I->getType()->isVectorTy()) 14945 return false; 14946 14947 if (Subtarget->hasNEON()) { 14948 switch (I->getOpcode()) { 14949 case Instruction::Sub: 14950 case Instruction::Add: { 14951 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 14952 return false; 14953 Ops.push_back(&I->getOperandUse(0)); 14954 Ops.push_back(&I->getOperandUse(1)); 14955 return true; 14956 } 14957 default: 14958 return false; 14959 } 14960 } 14961 14962 if (!Subtarget->hasMVEIntegerOps()) 14963 return false; 14964 14965 auto IsSinker = [](Instruction *I, int Operand) { 14966 switch (I->getOpcode()) { 14967 case Instruction::Add: 14968 case Instruction::Mul: 14969 case Instruction::ICmp: 14970 return true; 14971 case Instruction::Sub: 14972 case Instruction::Shl: 14973 case Instruction::LShr: 14974 case Instruction::AShr: 14975 return Operand == 1; 14976 default: 14977 return false; 14978 } 14979 }; 14980 14981 int Op = 0; 14982 if (!isa<ShuffleVectorInst>(I->getOperand(Op))) 14983 Op = 1; 14984 if (!IsSinker(I, Op)) 14985 return false; 14986 if (!match(I->getOperand(Op), 14987 m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), 14988 m_Undef(), m_Zero()))) { 14989 return false; 14990 } 14991 Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); 14992 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 14993 // and vector registers 14994 for (Use &U : Shuffle->uses()) { 14995 Instruction *Insn = cast<Instruction>(U.getUser()); 14996 if (!IsSinker(Insn, U.getOperandNo())) 14997 return false; 14998 } 14999 Ops.push_back(&Shuffle->getOperandUse(0)); 15000 Ops.push_back(&I->getOperandUse(Op)); 15001 return true; 15002 } 15003 15004 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 15005 EVT VT = ExtVal.getValueType(); 15006 15007 if (!isTypeLegal(VT)) 15008 return false; 15009 15010 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 15011 if (Ld->isExpandingLoad()) 15012 return false; 15013 } 15014 15015 // Don't create a loadext if we can fold the extension into a wide/long 15016 // instruction. 15017 // If there's more than one user instruction, the loadext is desirable no 15018 // matter what. There can be two uses by the same instruction. 
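// If the only user is an add, sub or shift, the extend can typically be
// folded into a widening/long form of that operation instead, so an
// extending load is not desirable there (see the opcode check below).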
15019 if (ExtVal->use_empty() || 15020 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 15021 return true; 15022 15023 SDNode *U = *ExtVal->use_begin(); 15024 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 15025 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) 15026 return false; 15027 15028 return true; 15029 } 15030 15031 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 15032 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 15033 return false; 15034 15035 if (!isTypeLegal(EVT::getEVT(Ty1))) 15036 return false; 15037 15038 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 15039 15040 // Assuming the caller doesn't have a zeroext or signext return parameter, 15041 // truncation all the way down to i1 is valid. 15042 return true; 15043 } 15044 15045 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 15046 const AddrMode &AM, Type *Ty, 15047 unsigned AS) const { 15048 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 15049 if (Subtarget->hasFPAO()) 15050 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 15051 return 0; 15052 } 15053 return -1; 15054 } 15055 15056 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster 15057 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be 15058 /// expanded to FMAs when this method returns true, otherwise fmuladd is 15059 /// expanded to fmul + fadd. 15060 /// 15061 /// ARM supports both fused and unfused multiply-add operations; we already 15062 /// lower a pair of fmul and fadd to the latter so it's not clear that there 15063 /// would be a gain or that the gain would be worthwhile enough to risk 15064 /// correctness bugs. 15065 /// 15066 /// For MVE, we set this to true as it helps simplify the need for some 15067 /// patterns (and we don't have the non-fused floating point instruction). 
15068 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 15069 EVT VT) const { 15070 if (!VT.isSimple()) 15071 return false; 15072 15073 switch (VT.getSimpleVT().SimpleTy) { 15074 case MVT::v4f32: 15075 case MVT::v8f16: 15076 return Subtarget->hasMVEFloatOps(); 15077 case MVT::f16: 15078 return Subtarget->useFPVFMx16(); 15079 case MVT::f32: 15080 return Subtarget->useFPVFMx(); 15081 case MVT::f64: 15082 return Subtarget->useFPVFMx64(); 15083 default: 15084 break; 15085 } 15086 15087 return false; 15088 } 15089 15090 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 15091 if (V < 0) 15092 return false; 15093 15094 unsigned Scale = 1; 15095 switch (VT.getSimpleVT().SimpleTy) { 15096 case MVT::i1: 15097 case MVT::i8: 15098 // Scale == 1; 15099 break; 15100 case MVT::i16: 15101 // Scale == 2; 15102 Scale = 2; 15103 break; 15104 default: 15105 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 15106 // Scale == 4; 15107 Scale = 4; 15108 break; 15109 } 15110 15111 if ((V & (Scale - 1)) != 0) 15112 return false; 15113 return isUInt<5>(V / Scale); 15114 } 15115 15116 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 15117 const ARMSubtarget *Subtarget) { 15118 if (!VT.isInteger() && !VT.isFloatingPoint()) 15119 return false; 15120 if (VT.isVector() && Subtarget->hasNEON()) 15121 return false; 15122 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 15123 !Subtarget->hasMVEFloatOps()) 15124 return false; 15125 15126 bool IsNeg = false; 15127 if (V < 0) { 15128 IsNeg = true; 15129 V = -V; 15130 } 15131 15132 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 15133 15134 // MVE: size * imm7 15135 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 15136 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 15137 case MVT::i32: 15138 case MVT::f32: 15139 return isShiftedUInt<7,2>(V); 15140 case MVT::i16: 15141 case MVT::f16: 15142 return isShiftedUInt<7,1>(V); 15143 case MVT::i8: 15144 return isUInt<7>(V); 15145 default: 15146 return false; 15147 } 15148 } 15149 15150 // half VLDR: 2 * imm8 15151 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 15152 return isShiftedUInt<8, 1>(V); 15153 // VLDR and LDRD: 4 * imm8 15154 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 15155 return isShiftedUInt<8, 2>(V); 15156 15157 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 15158 // + imm12 or - imm8 15159 if (IsNeg) 15160 return isUInt<8>(V); 15161 return isUInt<12>(V); 15162 } 15163 15164 return false; 15165 } 15166 15167 /// isLegalAddressImmediate - Return true if the integer value can be used 15168 /// as the offset of the target addressing mode for load / store of the 15169 /// given type. 15170 static bool isLegalAddressImmediate(int64_t V, EVT VT, 15171 const ARMSubtarget *Subtarget) { 15172 if (V == 0) 15173 return true; 15174 15175 if (!VT.isSimple()) 15176 return false; 15177 15178 if (Subtarget->isThumb1Only()) 15179 return isLegalT1AddressImmediate(V, VT); 15180 else if (Subtarget->isThumb2()) 15181 return isLegalT2AddressImmediate(V, VT, Subtarget); 15182 15183 // ARM mode. 15184 if (V < 0) 15185 V = - V; 15186 switch (VT.getSimpleVT().SimpleTy) { 15187 default: return false; 15188 case MVT::i1: 15189 case MVT::i8: 15190 case MVT::i32: 15191 // +- imm12 15192 return isUInt<12>(V); 15193 case MVT::i16: 15194 // +- imm8 15195 return isUInt<8>(V); 15196 case MVT::f32: 15197 case MVT::f64: 15198 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 
15199 return false; 15200 return isShiftedUInt<8, 2>(V); 15201 } 15202 } 15203 15204 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 15205 EVT VT) const { 15206 int Scale = AM.Scale; 15207 if (Scale < 0) 15208 return false; 15209 15210 switch (VT.getSimpleVT().SimpleTy) { 15211 default: return false; 15212 case MVT::i1: 15213 case MVT::i8: 15214 case MVT::i16: 15215 case MVT::i32: 15216 if (Scale == 1) 15217 return true; 15218 // r + r << imm 15219 Scale = Scale & ~1; 15220 return Scale == 2 || Scale == 4 || Scale == 8; 15221 case MVT::i64: 15222 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 15223 // version in Thumb mode. 15224 // r + r 15225 if (Scale == 1) 15226 return true; 15227 // r * 2 (this can be lowered to r + r). 15228 if (!AM.HasBaseReg && Scale == 2) 15229 return true; 15230 return false; 15231 case MVT::isVoid: 15232 // Note, we allow "void" uses (basically, uses that aren't loads or 15233 // stores), because arm allows folding a scale into many arithmetic 15234 // operations. This should be made more precise and revisited later. 15235 15236 // Allow r << imm, but the imm has to be a multiple of two. 15237 if (Scale & 1) return false; 15238 return isPowerOf2_32(Scale); 15239 } 15240 } 15241 15242 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 15243 EVT VT) const { 15244 const int Scale = AM.Scale; 15245 15246 // Negative scales are not supported in Thumb1. 15247 if (Scale < 0) 15248 return false; 15249 15250 // Thumb1 addressing modes do not support register scaling excepting the 15251 // following cases: 15252 // 1. Scale == 1 means no scaling. 15253 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 15254 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 15255 } 15256 15257 /// isLegalAddressingMode - Return true if the addressing mode represented 15258 /// by AM is legal for this target, for a load/store of the specified type. 15259 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 15260 const AddrMode &AM, Type *Ty, 15261 unsigned AS, Instruction *I) const { 15262 EVT VT = getValueType(DL, Ty, true); 15263 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 15264 return false; 15265 15266 // Can never fold addr of global into load/store. 15267 if (AM.BaseGV) 15268 return false; 15269 15270 switch (AM.Scale) { 15271 case 0: // no scale reg, must be "r+i" or "r", or "i". 15272 break; 15273 default: 15274 // ARM doesn't support any R+R*scale+imm addr modes. 15275 if (AM.BaseOffs) 15276 return false; 15277 15278 if (!VT.isSimple()) 15279 return false; 15280 15281 if (Subtarget->isThumb1Only()) 15282 return isLegalT1ScaledAddressingMode(AM, VT); 15283 15284 if (Subtarget->isThumb2()) 15285 return isLegalT2ScaledAddressingMode(AM, VT); 15286 15287 int Scale = AM.Scale; 15288 switch (VT.getSimpleVT().SimpleTy) { 15289 default: return false; 15290 case MVT::i1: 15291 case MVT::i8: 15292 case MVT::i32: 15293 if (Scale < 0) Scale = -Scale; 15294 if (Scale == 1) 15295 return true; 15296 // r + r << imm 15297 return isPowerOf2_32(Scale & ~1); 15298 case MVT::i16: 15299 case MVT::i64: 15300 // r +/- r 15301 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 15302 return true; 15303 // r * 2 (this can be lowered to r + r). 
15304 if (!AM.HasBaseReg && Scale == 2) 15305 return true; 15306 return false; 15307 15308 case MVT::isVoid: 15309 // Note, we allow "void" uses (basically, uses that aren't loads or 15310 // stores), because arm allows folding a scale into many arithmetic 15311 // operations. This should be made more precise and revisited later. 15312 15313 // Allow r << imm, but the imm has to be a multiple of two. 15314 if (Scale & 1) return false; 15315 return isPowerOf2_32(Scale); 15316 } 15317 } 15318 return true; 15319 } 15320 15321 /// isLegalICmpImmediate - Return true if the specified immediate is legal 15322 /// icmp immediate, that is the target has icmp instructions which can compare 15323 /// a register against the immediate without having to materialize the 15324 /// immediate into a register. 15325 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 15326 // Thumb2 and ARM modes can use cmn for negative immediates. 15327 if (!Subtarget->isThumb()) 15328 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 15329 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 15330 if (Subtarget->isThumb2()) 15331 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 15332 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 15333 // Thumb1 doesn't have cmn, and only 8-bit immediates. 15334 return Imm >= 0 && Imm <= 255; 15335 } 15336 15337 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 15338 /// *or sub* immediate, that is the target has add or sub instructions which can 15339 /// add a register with the immediate without having to materialize the 15340 /// immediate into a register. 15341 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 15342 // Same encoding for add/sub, just flip the sign. 15343 int64_t AbsImm = std::abs(Imm); 15344 if (!Subtarget->isThumb()) 15345 return ARM_AM::getSOImmVal(AbsImm) != -1; 15346 if (Subtarget->isThumb2()) 15347 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 15348 // Thumb1 only has 8-bit unsigned immediate. 
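// e.g. adds/subs Rd, #imm8, so after taking the absolute value anything in
// [0, 255] is directly encodable.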
15349 return AbsImm >= 0 && AbsImm <= 255; 15350 } 15351 15352 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 15353 bool isSEXTLoad, SDValue &Base, 15354 SDValue &Offset, bool &isInc, 15355 SelectionDAG &DAG) { 15356 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15357 return false; 15358 15359 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 15360 // AddressingMode 3 15361 Base = Ptr->getOperand(0); 15362 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15363 int RHSC = (int)RHS->getZExtValue(); 15364 if (RHSC < 0 && RHSC > -256) { 15365 assert(Ptr->getOpcode() == ISD::ADD); 15366 isInc = false; 15367 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15368 return true; 15369 } 15370 } 15371 isInc = (Ptr->getOpcode() == ISD::ADD); 15372 Offset = Ptr->getOperand(1); 15373 return true; 15374 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 15375 // AddressingMode 2 15376 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15377 int RHSC = (int)RHS->getZExtValue(); 15378 if (RHSC < 0 && RHSC > -0x1000) { 15379 assert(Ptr->getOpcode() == ISD::ADD); 15380 isInc = false; 15381 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15382 Base = Ptr->getOperand(0); 15383 return true; 15384 } 15385 } 15386 15387 if (Ptr->getOpcode() == ISD::ADD) { 15388 isInc = true; 15389 ARM_AM::ShiftOpc ShOpcVal= 15390 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 15391 if (ShOpcVal != ARM_AM::no_shift) { 15392 Base = Ptr->getOperand(1); 15393 Offset = Ptr->getOperand(0); 15394 } else { 15395 Base = Ptr->getOperand(0); 15396 Offset = Ptr->getOperand(1); 15397 } 15398 return true; 15399 } 15400 15401 isInc = (Ptr->getOpcode() == ISD::ADD); 15402 Base = Ptr->getOperand(0); 15403 Offset = Ptr->getOperand(1); 15404 return true; 15405 } 15406 15407 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 15408 return false; 15409 } 15410 15411 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 15412 bool isSEXTLoad, SDValue &Base, 15413 SDValue &Offset, bool &isInc, 15414 SelectionDAG &DAG) { 15415 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15416 return false; 15417 15418 Base = Ptr->getOperand(0); 15419 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15420 int RHSC = (int)RHS->getZExtValue(); 15421 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 15422 assert(Ptr->getOpcode() == ISD::ADD); 15423 isInc = false; 15424 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15425 return true; 15426 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 15427 isInc = Ptr->getOpcode() == ISD::ADD; 15428 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15429 return true; 15430 } 15431 } 15432 15433 return false; 15434 } 15435 15436 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, 15437 bool isSEXTLoad, bool IsMasked, bool isLE, 15438 SDValue &Base, SDValue &Offset, 15439 bool &isInc, SelectionDAG &DAG) { 15440 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15441 return false; 15442 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 15443 return false; 15444 15445 // We allow LE non-masked loads to change the type (for example use a vldrb.8 15446 // as opposed to a vldrw.32). This can allow extra addressing modes or 15447 // alignments for what is otherwise an equivalent instruction. 
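// This relies on little-endian MVE loads and stores of different element
// sizes producing the same register contents; for big-endian or predicated
// (masked) accesses the element size matters, so the type must be kept.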
15448 bool CanChangeType = isLE && !IsMasked; 15449 15450 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 15451 int RHSC = (int)RHS->getZExtValue(); 15452 15453 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 15454 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 15455 assert(Ptr->getOpcode() == ISD::ADD); 15456 isInc = false; 15457 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15458 return true; 15459 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 15460 isInc = Ptr->getOpcode() == ISD::ADD; 15461 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15462 return true; 15463 } 15464 return false; 15465 }; 15466 15467 // Try to find a matching instruction based on s/zext, Alignment, Offset and 15468 // (in BE/masked) type. 15469 Base = Ptr->getOperand(0); 15470 if (VT == MVT::v4i16) { 15471 if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) 15472 return true; 15473 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 15474 if (IsInRange(RHSC, 0x80, 1)) 15475 return true; 15476 } else if (Align >= 4 && 15477 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 15478 IsInRange(RHSC, 0x80, 4)) 15479 return true; 15480 else if (Align >= 2 && 15481 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 15482 IsInRange(RHSC, 0x80, 2)) 15483 return true; 15484 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 15485 return true; 15486 return false; 15487 } 15488 15489 /// getPreIndexedAddressParts - returns true by value, base pointer and 15490 /// offset pointer and addressing mode by reference if the node's address 15491 /// can be legally represented as pre-indexed load / store address. 15492 bool 15493 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 15494 SDValue &Offset, 15495 ISD::MemIndexedMode &AM, 15496 SelectionDAG &DAG) const { 15497 if (Subtarget->isThumb1Only()) 15498 return false; 15499 15500 EVT VT; 15501 SDValue Ptr; 15502 unsigned Align; 15503 bool isSEXTLoad = false; 15504 bool IsMasked = false; 15505 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 15506 Ptr = LD->getBasePtr(); 15507 VT = LD->getMemoryVT(); 15508 Align = LD->getAlignment(); 15509 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15510 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 15511 Ptr = ST->getBasePtr(); 15512 VT = ST->getMemoryVT(); 15513 Align = ST->getAlignment(); 15514 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 15515 Ptr = LD->getBasePtr(); 15516 VT = LD->getMemoryVT(); 15517 Align = LD->getAlignment(); 15518 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15519 IsMasked = true; 15520 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 15521 Ptr = ST->getBasePtr(); 15522 VT = ST->getMemoryVT(); 15523 Align = ST->getAlignment(); 15524 IsMasked = true; 15525 } else 15526 return false; 15527 15528 bool isInc; 15529 bool isLegal = false; 15530 if (VT.isVector()) 15531 isLegal = Subtarget->hasMVEIntegerOps() && 15532 getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, 15533 IsMasked, Subtarget->isLittle(), Base, 15534 Offset, isInc, DAG); 15535 else { 15536 if (Subtarget->isThumb2()) 15537 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 15538 Offset, isInc, DAG); 15539 else 15540 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 15541 Offset, isInc, DAG); 15542 } 15543 if (!isLegal) 15544 return false; 15545 15546 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 15547 return true; 15548 } 15549 15550 /// getPostIndexedAddressParts - returns true by value, base pointer and 15551 /// offset pointer and addressing mode by reference if this node can be 15552 /// combined with a load / store to form a post-indexed load / store. 15553 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 15554 SDValue &Base, 15555 SDValue &Offset, 15556 ISD::MemIndexedMode &AM, 15557 SelectionDAG &DAG) const { 15558 EVT VT; 15559 SDValue Ptr; 15560 unsigned Align; 15561 bool isSEXTLoad = false, isNonExt; 15562 bool IsMasked = false; 15563 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 15564 VT = LD->getMemoryVT(); 15565 Ptr = LD->getBasePtr(); 15566 Align = LD->getAlignment(); 15567 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15568 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 15569 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 15570 VT = ST->getMemoryVT(); 15571 Ptr = ST->getBasePtr(); 15572 Align = ST->getAlignment(); 15573 isNonExt = !ST->isTruncatingStore(); 15574 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 15575 VT = LD->getMemoryVT(); 15576 Ptr = LD->getBasePtr(); 15577 Align = LD->getAlignment(); 15578 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15579 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 15580 IsMasked = true; 15581 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 15582 VT = ST->getMemoryVT(); 15583 Ptr = ST->getBasePtr(); 15584 Align = ST->getAlignment(); 15585 isNonExt = !ST->isTruncatingStore(); 15586 IsMasked = true; 15587 } else 15588 return false; 15589 15590 if (Subtarget->isThumb1Only()) { 15591 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 15592 // must be non-extending/truncating, i32, with an offset of 4. 15593 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 15594 if (Op->getOpcode() != ISD::ADD || !isNonExt) 15595 return false; 15596 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 15597 if (!RHS || RHS->getZExtValue() != 4) 15598 return false; 15599 15600 Offset = Op->getOperand(1); 15601 Base = Op->getOperand(0); 15602 AM = ISD::POST_INC; 15603 return true; 15604 } 15605 15606 bool isInc; 15607 bool isLegal = false; 15608 if (VT.isVector()) 15609 isLegal = Subtarget->hasMVEIntegerOps() && 15610 getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, 15611 Subtarget->isLittle(), Base, Offset, 15612 isInc, DAG); 15613 else { 15614 if (Subtarget->isThumb2()) 15615 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 15616 isInc, DAG); 15617 else 15618 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 15619 isInc, DAG); 15620 } 15621 if (!isLegal) 15622 return false; 15623 15624 if (Ptr != Base) { 15625 // Swap base ptr and offset to catch more post-index load / store when 15626 // it's legal. In Thumb2 mode, offset must be an immediate. 15627 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 15628 !Subtarget->isThumb2()) 15629 std::swap(Base, Offset); 15630 15631 // Post-indexed load / store update the base pointer. 15632 if (Ptr != Base) 15633 return false; 15634 } 15635 15636 AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; 15637 return true; 15638 } 15639 15640 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 15641 KnownBits &Known, 15642 const APInt &DemandedElts, 15643 const SelectionDAG &DAG, 15644 unsigned Depth) const { 15645 unsigned BitWidth = Known.getBitWidth(); 15646 Known.resetAll(); 15647 switch (Op.getOpcode()) { 15648 default: break; 15649 case ARMISD::ADDC: 15650 case ARMISD::ADDE: 15651 case ARMISD::SUBC: 15652 case ARMISD::SUBE: 15653 // Special cases when we convert a carry to a boolean. 15654 if (Op.getResNo() == 0) { 15655 SDValue LHS = Op.getOperand(0); 15656 SDValue RHS = Op.getOperand(1); 15657 // (ADDE 0, 0, C) will give us a single bit. 15658 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 15659 isNullConstant(RHS)) { 15660 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 15661 return; 15662 } 15663 } 15664 break; 15665 case ARMISD::CMOV: { 15666 // Bits are known zero/one if known on the LHS and RHS. 15667 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 15668 if (Known.isUnknown()) 15669 return; 15670 15671 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 15672 Known.Zero &= KnownRHS.Zero; 15673 Known.One &= KnownRHS.One; 15674 return; 15675 } 15676 case ISD::INTRINSIC_W_CHAIN: { 15677 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 15678 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 15679 switch (IntID) { 15680 default: return; 15681 case Intrinsic::arm_ldaex: 15682 case Intrinsic::arm_ldrex: { 15683 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 15684 unsigned MemBits = VT.getScalarSizeInBits(); 15685 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 15686 return; 15687 } 15688 } 15689 } 15690 case ARMISD::BFI: { 15691 // Conservatively, we can recurse down the first operand 15692 // and just mask out all affected bits. 15693 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 15694 15695 // The operand to BFI is already a mask suitable for removing the bits it 15696 // sets. 
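// That is, operand 2 has its bits cleared where BFI inserts new bits, so
// ANDing both Known.Zero and Known.One with it discards anything previously
// known about the inserted field while keeping the remaining bits.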
15697 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 15698 const APInt &Mask = CI->getAPIntValue(); 15699 Known.Zero &= Mask; 15700 Known.One &= Mask; 15701 return; 15702 } 15703 case ARMISD::VGETLANEs: 15704 case ARMISD::VGETLANEu: { 15705 const SDValue &SrcSV = Op.getOperand(0); 15706 EVT VecVT = SrcSV.getValueType(); 15707 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 15708 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 15709 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 15710 assert(Pos->getAPIntValue().ult(NumSrcElts) && 15711 "VGETLANE index out of bounds"); 15712 unsigned Idx = Pos->getZExtValue(); 15713 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 15714 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 15715 15716 EVT VT = Op.getValueType(); 15717 const unsigned DstSz = VT.getScalarSizeInBits(); 15718 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 15719 (void)SrcSz; 15720 assert(SrcSz == Known.getBitWidth()); 15721 assert(DstSz > SrcSz); 15722 if (Op.getOpcode() == ARMISD::VGETLANEs) 15723 Known = Known.sext(DstSz); 15724 else { 15725 Known = Known.zext(DstSz, true /* extended bits are known zero */); 15726 } 15727 assert(DstSz == Known.getBitWidth()); 15728 break; 15729 } 15730 } 15731 } 15732 15733 bool 15734 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, 15735 const APInt &DemandedAPInt, 15736 TargetLoweringOpt &TLO) const { 15737 // Delay optimization, so we don't have to deal with illegal types, or block 15738 // optimizations. 15739 if (!TLO.LegalOps) 15740 return false; 15741 15742 // Only optimize AND for now. 15743 if (Op.getOpcode() != ISD::AND) 15744 return false; 15745 15746 EVT VT = Op.getValueType(); 15747 15748 // Ignore vectors. 15749 if (VT.isVector()) 15750 return false; 15751 15752 assert(VT == MVT::i32 && "Unexpected integer type"); 15753 15754 // Make sure the RHS really is a constant. 15755 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 15756 if (!C) 15757 return false; 15758 15759 unsigned Mask = C->getZExtValue(); 15760 15761 unsigned Demanded = DemandedAPInt.getZExtValue(); 15762 unsigned ShrunkMask = Mask & Demanded; 15763 unsigned ExpandedMask = Mask | ~Demanded; 15764 15765 // If the mask is all zeros, let the target-independent code replace the 15766 // result with zero. 15767 if (ShrunkMask == 0) 15768 return false; 15769 15770 // If the mask is all ones, erase the AND. (Currently, the target-independent 15771 // code won't do this, so we have to do it explicitly to avoid an infinite 15772 // loop in obscure cases.) 15773 if (ExpandedMask == ~0U) 15774 return TLO.CombineTo(Op, Op.getOperand(0)); 15775 15776 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 15777 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 15778 }; 15779 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 15780 if (NewMask == Mask) 15781 return true; 15782 SDLoc DL(Op); 15783 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 15784 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 15785 return TLO.CombineTo(Op, NewOp); 15786 }; 15787 15788 // Prefer uxtb mask. 15789 if (IsLegalMask(0xFF)) 15790 return UseMask(0xFF); 15791 15792 // Prefer uxth mask. 15793 if (IsLegalMask(0xFFFF)) 15794 return UseMask(0xFFFF); 15795 15796 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 
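// e.g. if only the low 6 bits of (x & 0x12345678) are demanded, the mask
// shrinks to 0x38, which is below 256 and therefore cheap to materialize.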
15797 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 15798 if (ShrunkMask < 256) 15799 return UseMask(ShrunkMask); 15800 15801 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 15802 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 15803 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 15804 return UseMask(ExpandedMask); 15805 15806 // Potential improvements: 15807 // 15808 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 15809 // We could try to prefer Thumb1 immediates which can be lowered to a 15810 // two-instruction sequence. 15811 // We could try to recognize more legal ARM/Thumb2 immediates here. 15812 15813 return false; 15814 } 15815 15816 15817 //===----------------------------------------------------------------------===// 15818 // ARM Inline Assembly Support 15819 //===----------------------------------------------------------------------===// 15820 15821 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 15822 // Looking for "rev" which is V6+. 15823 if (!Subtarget->hasV6Ops()) 15824 return false; 15825 15826 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 15827 std::string AsmStr = IA->getAsmString(); 15828 SmallVector<StringRef, 4> AsmPieces; 15829 SplitString(AsmStr, AsmPieces, ";\n"); 15830 15831 switch (AsmPieces.size()) { 15832 default: return false; 15833 case 1: 15834 AsmStr = AsmPieces[0]; 15835 AsmPieces.clear(); 15836 SplitString(AsmStr, AsmPieces, " \t,"); 15837 15838 // rev $0, $1 15839 if (AsmPieces.size() == 3 && 15840 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 15841 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 15842 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 15843 if (Ty && Ty->getBitWidth() == 32) 15844 return IntrinsicLowering::LowerToByteSwap(CI); 15845 } 15846 break; 15847 } 15848 15849 return false; 15850 } 15851 15852 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 15853 // At this point, we have to lower this constraint to something else, so we 15854 // lower it to an "r" or "w". However, by doing this we will force the result 15855 // to be in register, while the X constraint is much more permissive. 15856 // 15857 // Although we are correct (we are free to emit anything, without 15858 // constraints), we might break use cases that would expect us to be more 15859 // efficient and emit something else. 15860 if (!Subtarget->hasVFP2Base()) 15861 return "r"; 15862 if (ConstraintVT.isFloatingPoint()) 15863 return "w"; 15864 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 15865 (ConstraintVT.getSizeInBits() == 64 || 15866 ConstraintVT.getSizeInBits() == 128)) 15867 return "w"; 15868 15869 return "r"; 15870 } 15871 15872 /// getConstraintType - Given a constraint letter, return the type of 15873 /// constraint it is for this target. 15874 ARMTargetLowering::ConstraintType 15875 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 15876 unsigned S = Constraint.size(); 15877 if (S == 1) { 15878 switch (Constraint[0]) { 15879 default: break; 15880 case 'l': return C_RegisterClass; 15881 case 'w': return C_RegisterClass; 15882 case 'h': return C_RegisterClass; 15883 case 'x': return C_RegisterClass; 15884 case 't': return C_RegisterClass; 15885 case 'j': return C_Immediate; // Constant for movw. 15886 // An address with a single base register. Due to the way we 15887 // currently handle addresses it is the same as an 'r' memory constraint. 
15888 case 'Q': return C_Memory; 15889 } 15890 } else if (S == 2) { 15891 switch (Constraint[0]) { 15892 default: break; 15893 case 'T': return C_RegisterClass; 15894 // All 'U+' constraints are addresses. 15895 case 'U': return C_Memory; 15896 } 15897 } 15898 return TargetLowering::getConstraintType(Constraint); 15899 } 15900 15901 /// Examine constraint type and operand type and determine a weight value. 15902 /// This object must already have been set up with the operand type 15903 /// and the current alternative constraint selected. 15904 TargetLowering::ConstraintWeight 15905 ARMTargetLowering::getSingleConstraintMatchWeight( 15906 AsmOperandInfo &info, const char *constraint) const { 15907 ConstraintWeight weight = CW_Invalid; 15908 Value *CallOperandVal = info.CallOperandVal; 15909 // If we don't have a value, we can't do a match, 15910 // but allow it at the lowest weight. 15911 if (!CallOperandVal) 15912 return CW_Default; 15913 Type *type = CallOperandVal->getType(); 15914 // Look at the constraint type. 15915 switch (*constraint) { 15916 default: 15917 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 15918 break; 15919 case 'l': 15920 if (type->isIntegerTy()) { 15921 if (Subtarget->isThumb()) 15922 weight = CW_SpecificReg; 15923 else 15924 weight = CW_Register; 15925 } 15926 break; 15927 case 'w': 15928 if (type->isFloatingPointTy()) 15929 weight = CW_Register; 15930 break; 15931 } 15932 return weight; 15933 } 15934 15935 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 15936 15937 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 15938 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 15939 switch (Constraint.size()) { 15940 case 1: 15941 // GCC ARM Constraint Letters 15942 switch (Constraint[0]) { 15943 case 'l': // Low regs or general regs. 15944 if (Subtarget->isThumb()) 15945 return RCPair(0U, &ARM::tGPRRegClass); 15946 return RCPair(0U, &ARM::GPRRegClass); 15947 case 'h': // High regs or no regs. 
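// 'h' selects the Thumb high registers, which most 16-bit Thumb
// instructions cannot encode; outside Thumb mode it falls through to the
// generic constraint handling.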
15948 if (Subtarget->isThumb()) 15949 return RCPair(0U, &ARM::hGPRRegClass); 15950 break; 15951 case 'r': 15952 if (Subtarget->isThumb1Only()) 15953 return RCPair(0U, &ARM::tGPRRegClass); 15954 return RCPair(0U, &ARM::GPRRegClass); 15955 case 'w': 15956 if (VT == MVT::Other) 15957 break; 15958 if (VT == MVT::f32) 15959 return RCPair(0U, &ARM::SPRRegClass); 15960 if (VT.getSizeInBits() == 64) 15961 return RCPair(0U, &ARM::DPRRegClass); 15962 if (VT.getSizeInBits() == 128) 15963 return RCPair(0U, &ARM::QPRRegClass); 15964 break; 15965 case 'x': 15966 if (VT == MVT::Other) 15967 break; 15968 if (VT == MVT::f32) 15969 return RCPair(0U, &ARM::SPR_8RegClass); 15970 if (VT.getSizeInBits() == 64) 15971 return RCPair(0U, &ARM::DPR_8RegClass); 15972 if (VT.getSizeInBits() == 128) 15973 return RCPair(0U, &ARM::QPR_8RegClass); 15974 break; 15975 case 't': 15976 if (VT == MVT::Other) 15977 break; 15978 if (VT == MVT::f32 || VT == MVT::i32) 15979 return RCPair(0U, &ARM::SPRRegClass); 15980 if (VT.getSizeInBits() == 64) 15981 return RCPair(0U, &ARM::DPR_VFP2RegClass); 15982 if (VT.getSizeInBits() == 128) 15983 return RCPair(0U, &ARM::QPR_VFP2RegClass); 15984 break; 15985 } 15986 break; 15987 15988 case 2: 15989 if (Constraint[0] == 'T') { 15990 switch (Constraint[1]) { 15991 default: 15992 break; 15993 case 'e': 15994 return RCPair(0U, &ARM::tGPREvenRegClass); 15995 case 'o': 15996 return RCPair(0U, &ARM::tGPROddRegClass); 15997 } 15998 } 15999 break; 16000 16001 default: 16002 break; 16003 } 16004 16005 if (StringRef("{cc}").equals_lower(Constraint)) 16006 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 16007 16008 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 16009 } 16010 16011 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 16012 /// vector. If it is invalid, don't add anything to Ops. 16013 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16014 std::string &Constraint, 16015 std::vector<SDValue>&Ops, 16016 SelectionDAG &DAG) const { 16017 SDValue Result; 16018 16019 // Currently only support length 1 constraints. 16020 if (Constraint.length() != 1) return; 16021 16022 char ConstraintLetter = Constraint[0]; 16023 switch (ConstraintLetter) { 16024 default: break; 16025 case 'j': 16026 case 'I': case 'J': case 'K': case 'L': 16027 case 'M': case 'N': case 'O': 16028 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 16029 if (!C) 16030 return; 16031 16032 int64_t CVal64 = C->getSExtValue(); 16033 int CVal = (int) CVal64; 16034 // None of these constraints allow values larger than 32 bits. Check 16035 // that the value fits in an int. 16036 if (CVal != CVal64) 16037 return; 16038 16039 switch (ConstraintLetter) { 16040 case 'j': 16041 // Constant suitable for movw, must be between 0 and 16042 // 65535. 16043 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 16044 if (CVal >= 0 && CVal <= 65535) 16045 break; 16046 return; 16047 case 'I': 16048 if (Subtarget->isThumb1Only()) { 16049 // This must be a constant between 0 and 255, for ADD 16050 // immediates. 16051 if (CVal >= 0 && CVal <= 255) 16052 break; 16053 } else if (Subtarget->isThumb2()) { 16054 // A constant that can be used as an immediate value in a 16055 // data-processing instruction. 16056 if (ARM_AM::getT2SOImmVal(CVal) != -1) 16057 break; 16058 } else { 16059 // A constant that can be used as an immediate value in a 16060 // data-processing instruction. 
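// In ARM mode this is a modified immediate: an 8-bit value rotated right by
// an even amount, e.g. 0xff0 is encodable but 0x101 is not.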
16061 if (ARM_AM::getSOImmVal(CVal) != -1) 16062 break; 16063 } 16064 return; 16065 16066 case 'J': 16067 if (Subtarget->isThumb1Only()) { 16068 // This must be a constant between -255 and -1, for negated ADD 16069 // immediates. This can be used in GCC with an "n" modifier that 16070 // prints the negated value, for use with SUB instructions. It is 16071 // not useful otherwise but is implemented for compatibility. 16072 if (CVal >= -255 && CVal <= -1) 16073 break; 16074 } else { 16075 // This must be a constant between -4095 and 4095. It is not clear 16076 // what this constraint is intended for. Implemented for 16077 // compatibility with GCC. 16078 if (CVal >= -4095 && CVal <= 4095) 16079 break; 16080 } 16081 return; 16082 16083 case 'K': 16084 if (Subtarget->isThumb1Only()) { 16085 // A 32-bit value where only one byte has a nonzero value. Exclude 16086 // zero to match GCC. This constraint is used by GCC internally for 16087 // constants that can be loaded with a move/shift combination. 16088 // It is not useful otherwise but is implemented for compatibility. 16089 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 16090 break; 16091 } else if (Subtarget->isThumb2()) { 16092 // A constant whose bitwise inverse can be used as an immediate 16093 // value in a data-processing instruction. This can be used in GCC 16094 // with a "B" modifier that prints the inverted value, for use with 16095 // BIC and MVN instructions. It is not useful otherwise but is 16096 // implemented for compatibility. 16097 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 16098 break; 16099 } else { 16100 // A constant whose bitwise inverse can be used as an immediate 16101 // value in a data-processing instruction. This can be used in GCC 16102 // with a "B" modifier that prints the inverted value, for use with 16103 // BIC and MVN instructions. It is not useful otherwise but is 16104 // implemented for compatibility. 16105 if (ARM_AM::getSOImmVal(~CVal) != -1) 16106 break; 16107 } 16108 return; 16109 16110 case 'L': 16111 if (Subtarget->isThumb1Only()) { 16112 // This must be a constant between -7 and 7, 16113 // for 3-operand ADD/SUB immediate instructions. 16114 if (CVal >= -7 && CVal < 7) 16115 break; 16116 } else if (Subtarget->isThumb2()) { 16117 // A constant whose negation can be used as an immediate value in a 16118 // data-processing instruction. This can be used in GCC with an "n" 16119 // modifier that prints the negated value, for use with SUB 16120 // instructions. It is not useful otherwise but is implemented for 16121 // compatibility. 16122 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 16123 break; 16124 } else { 16125 // A constant whose negation can be used as an immediate value in a 16126 // data-processing instruction. This can be used in GCC with an "n" 16127 // modifier that prints the negated value, for use with SUB 16128 // instructions. It is not useful otherwise but is implemented for 16129 // compatibility. 16130 if (ARM_AM::getSOImmVal(-CVal) != -1) 16131 break; 16132 } 16133 return; 16134 16135 case 'M': 16136 if (Subtarget->isThumb1Only()) { 16137 // This must be a multiple of 4 between 0 and 1020, for 16138 // ADD sp + immediate. 16139 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 16140 break; 16141 } else { 16142 // A power of two or a constant between 0 and 32. This is used in 16143 // GCC for the shift amount on shifted register operands, but it is 16144 // useful in general for any shift amounts. 
16145 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 16146 break; 16147 } 16148 return; 16149 16150 case 'N': 16151 if (Subtarget->isThumb1Only()) { 16152 // This must be a constant between 0 and 31, for shift amounts. 16153 if (CVal >= 0 && CVal <= 31) 16154 break; 16155 } 16156 return; 16157 16158 case 'O': 16159 if (Subtarget->isThumb1Only()) { 16160 // This must be a multiple of 4 between -508 and 508, for 16161 // ADD/SUB sp = sp + immediate. 16162 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 16163 break; 16164 } 16165 return; 16166 } 16167 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 16168 break; 16169 } 16170 16171 if (Result.getNode()) { 16172 Ops.push_back(Result); 16173 return; 16174 } 16175 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16176 } 16177 16178 static RTLIB::Libcall getDivRemLibcall( 16179 const SDNode *N, MVT::SimpleValueType SVT) { 16180 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 16181 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 16182 "Unhandled Opcode in getDivRemLibcall"); 16183 bool isSigned = N->getOpcode() == ISD::SDIVREM || 16184 N->getOpcode() == ISD::SREM; 16185 RTLIB::Libcall LC; 16186 switch (SVT) { 16187 default: llvm_unreachable("Unexpected request for libcall!"); 16188 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 16189 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 16190 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 16191 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 16192 } 16193 return LC; 16194 } 16195 16196 static TargetLowering::ArgListTy getDivRemArgList( 16197 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 16198 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 16199 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 16200 "Unhandled Opcode in getDivRemArgList"); 16201 bool isSigned = N->getOpcode() == ISD::SDIVREM || 16202 N->getOpcode() == ISD::SREM; 16203 TargetLowering::ArgListTy Args; 16204 TargetLowering::ArgListEntry Entry; 16205 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 16206 EVT ArgVT = N->getOperand(i).getValueType(); 16207 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 16208 Entry.Node = N->getOperand(i); 16209 Entry.Ty = ArgTy; 16210 Entry.IsSExt = isSigned; 16211 Entry.IsZExt = !isSigned; 16212 Args.push_back(Entry); 16213 } 16214 if (Subtarget->isTargetWindows() && Args.size() >= 2) 16215 std::swap(Args[0], Args[1]); 16216 return Args; 16217 } 16218 16219 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 16220 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 16221 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 16222 Subtarget->isTargetWindows()) && 16223 "Register-based DivRem lowering only"); 16224 unsigned Opcode = Op->getOpcode(); 16225 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 16226 "Invalid opcode for Div/Rem lowering"); 16227 bool isSigned = (Opcode == ISD::SDIVREM); 16228 EVT VT = Op->getValueType(0); 16229 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 16230 SDLoc dl(Op); 16231 16232 // If the target has hardware divide, use divide + multiply + subtract: 16233 // div = a / b 16234 // rem = a - b * div 16235 // return {div, rem} 16236 // This should be lowered into UDIV/SDIV + MLS later on. 
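// Otherwise we fall back to the RTABI divmod helpers (e.g. __aeabi_idivmod
// or __aeabi_uidivmod on AEABI targets), which return the quotient and
// remainder as a pair.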
16237 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 16238 : Subtarget->hasDivideInARMMode(); 16239 if (hasDivide && Op->getValueType(0).isSimple() && 16240 Op->getSimpleValueType(0) == MVT::i32) { 16241 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; 16242 const SDValue Dividend = Op->getOperand(0); 16243 const SDValue Divisor = Op->getOperand(1); 16244 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 16245 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 16246 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 16247 16248 SDValue Values[2] = {Div, Rem}; 16249 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 16250 } 16251 16252 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 16253 VT.getSimpleVT().SimpleTy); 16254 SDValue InChain = DAG.getEntryNode(); 16255 16256 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 16257 DAG.getContext(), 16258 Subtarget); 16259 16260 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16261 getPointerTy(DAG.getDataLayout())); 16262 16263 Type *RetTy = StructType::get(Ty, Ty); 16264 16265 if (Subtarget->isTargetWindows()) 16266 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 16267 16268 TargetLowering::CallLoweringInfo CLI(DAG); 16269 CLI.setDebugLoc(dl).setChain(InChain) 16270 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 16271 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 16272 16273 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 16274 return CallInfo.first; 16275 } 16276 16277 // Lowers REM using divmod helpers 16278 // see RTABI section 4.2/4.3 16279 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 16280 // Build return types (div and rem) 16281 std::vector<Type*> RetTyParams; 16282 Type *RetTyElement; 16283 16284 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 16285 default: llvm_unreachable("Unexpected request for libcall!"); 16286 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 16287 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 16288 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 16289 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 16290 } 16291 16292 RetTyParams.push_back(RetTyElement); 16293 RetTyParams.push_back(RetTyElement); 16294 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 16295 Type *RetTy = StructType::get(*DAG.getContext(), ret); 16296 16297 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
16298 SimpleTy); 16299 SDValue InChain = DAG.getEntryNode(); 16300 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 16301 Subtarget); 16302 bool isSigned = N->getOpcode() == ISD::SREM; 16303 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16304 getPointerTy(DAG.getDataLayout())); 16305 16306 if (Subtarget->isTargetWindows()) 16307 InChain = WinDBZCheckDenominator(DAG, N, InChain); 16308 16309 // Lower call 16310 CallLoweringInfo CLI(DAG); 16311 CLI.setChain(InChain) 16312 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 16313 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 16314 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 16315 16316 // Return second (rem) result operand (first contains div) 16317 SDNode *ResNode = CallResult.first.getNode(); 16318 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 16319 return ResNode->getOperand(1); 16320 } 16321 16322 SDValue 16323 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 16324 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 16325 SDLoc DL(Op); 16326 16327 // Get the inputs. 16328 SDValue Chain = Op.getOperand(0); 16329 SDValue Size = Op.getOperand(1); 16330 16331 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 16332 "no-stack-arg-probe")) { 16333 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 16334 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16335 Chain = SP.getValue(1); 16336 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 16337 if (Align) 16338 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 16339 DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); 16340 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 16341 SDValue Ops[2] = { SP, Chain }; 16342 return DAG.getMergeValues(Ops, DL); 16343 } 16344 16345 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 16346 DAG.getConstant(2, DL, MVT::i32)); 16347 16348 SDValue Flag; 16349 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 16350 Flag = Chain.getValue(1); 16351 16352 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 16353 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 16354 16355 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16356 Chain = NewSP.getValue(1); 16357 16358 SDValue Ops[2] = { NewSP, Chain }; 16359 return DAG.getMergeValues(Ops, DL); 16360 } 16361 16362 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 16363 bool IsStrict = Op->isStrictFPOpcode(); 16364 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 16365 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16366 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 16367 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 16368 "Unexpected type for custom-lowering FP_EXTEND"); 16369 16370 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16371 "With both FP DP and 16, any FP conversion is legal!"); 16372 16373 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 16374 "With FP16, 16 to 32 conversion is legal!"); 16375 16376 // Either we are converting from 16 -> 64, without FP16 and/or 16377 // FP.double-precision or without Armv8-fp. So we must do it in two 16378 // steps. 16379 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 16380 // without FP16. So we must do a function call. 
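  // For example, extending f16 -> f64 on a subtarget with FP16 but without
  // FP64 is handled by the loop below in two steps: a legal f16 -> f32
  // (STRICT_)FP_EXTEND node first, then a libcall (whatever
  // RTLIB::getFPEXT(f32, f64) resolves to for this target) for f32 -> f64.
  // A subtarget with neither feature ends up making two libcalls instead.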
16381 SDLoc Loc(Op); 16382 RTLIB::Libcall LC; 16383 MakeLibCallOptions CallOptions; 16384 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 16385 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 16386 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 16387 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 16388 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64); 16389 if (Supported) { 16390 if (IsStrict) { 16391 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 16392 {DstVT, MVT::Other}, {Chain, SrcVal}); 16393 Chain = SrcVal.getValue(1); 16394 } else { 16395 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 16396 } 16397 } else { 16398 LC = RTLIB::getFPEXT(SrcVT, DstVT); 16399 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16400 "Unexpected type for custom-lowering FP_EXTEND"); 16401 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 16402 Loc, Chain); 16403 } 16404 } 16405 16406 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 16407 } 16408 16409 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 16410 bool IsStrict = Op->isStrictFPOpcode(); 16411 16412 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 16413 EVT SrcVT = SrcVal.getValueType(); 16414 EVT DstVT = Op.getValueType(); 16415 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16416 const unsigned SrcSz = SrcVT.getSizeInBits(); 16417 (void)DstSz; 16418 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 16419 "Unexpected type for custom-lowering FP_ROUND"); 16420 16421 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16422 "With both FP DP and 16, any FP conversion is legal!"); 16423 16424 SDLoc Loc(Op); 16425 16426 // Instruction from 32 -> 16 if hasFP16 is valid 16427 if (SrcSz == 32 && Subtarget->hasFP16()) 16428 return Op; 16429 16430 // Lib call from 32 -> 16 / 64 -> [32, 16] 16431 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 16432 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16433 "Unexpected type for custom-lowering FP_ROUND"); 16434 MakeLibCallOptions CallOptions; 16435 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 16436 SDValue Result; 16437 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 16438 Loc, Chain); 16439 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 16440 } 16441 16442 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 16443 SelectionDAG &DAG) const { 16444 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 16445 MVT HalfT = MVT::i32; 16446 SDLoc dl(N); 16447 SDValue Hi, Lo, Tmp; 16448 16449 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 16450 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 16451 return ; 16452 16453 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 16454 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 16455 16456 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16457 DAG.getConstant(0, dl, HalfT)); 16458 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16459 DAG.getConstant(1, dl, HalfT)); 16460 16461 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 16462 DAG.getConstant(OpTypeBits - 1, dl, 16463 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 16464 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 16465 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 16466 SDValue(Lo.getNode(), 1)); 16467 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 16468 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 16469 16470 Results.push_back(Lo); 16471 Results.push_back(Hi); 16472 } 16473 16474 bool 16475 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 16476 // The ARM target isn't yet aware of offsets. 16477 return false; 16478 } 16479 16480 bool ARM::isBitFieldInvertedMask(unsigned v) { 16481 if (v == 0xffffffff) 16482 return false; 16483 16484 // there can be 1's on either or both "outsides", all the "inside" 16485 // bits must be 0's 16486 return isShiftedMask_32(~v); 16487 } 16488 16489 /// isFPImmLegal - Returns true if the target can instruction select the 16490 /// specified FP immediate natively. If false, the legalizer will 16491 /// materialize the FP immediate as a load from a constant pool. 16492 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 16493 bool ForCodeSize) const { 16494 if (!Subtarget->hasVFP3Base()) 16495 return false; 16496 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 16497 return ARM_AM::getFP16Imm(Imm) != -1; 16498 if (VT == MVT::f32) 16499 return ARM_AM::getFP32Imm(Imm) != -1; 16500 if (VT == MVT::f64 && Subtarget->hasFP64()) 16501 return ARM_AM::getFP64Imm(Imm) != -1; 16502 return false; 16503 } 16504 16505 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 16506 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 16507 /// specified in the intrinsic calls. 16508 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 16509 const CallInst &I, 16510 MachineFunction &MF, 16511 unsigned Intrinsic) const { 16512 switch (Intrinsic) { 16513 case Intrinsic::arm_neon_vld1: 16514 case Intrinsic::arm_neon_vld2: 16515 case Intrinsic::arm_neon_vld3: 16516 case Intrinsic::arm_neon_vld4: 16517 case Intrinsic::arm_neon_vld2lane: 16518 case Intrinsic::arm_neon_vld3lane: 16519 case Intrinsic::arm_neon_vld4lane: 16520 case Intrinsic::arm_neon_vld2dup: 16521 case Intrinsic::arm_neon_vld3dup: 16522 case Intrinsic::arm_neon_vld4dup: { 16523 Info.opc = ISD::INTRINSIC_W_CHAIN; 16524 // Conservatively set memVT to the entire set of vectors loaded. 
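    // The width is taken from the intrinsic's result type and expressed as a
    // vector of i64 lanes; e.g. a vld4 returning four <4 x i32> vectors
    // (512 bits in total) is recorded as v8i64. Only the overall footprint
    // matters for the conservative MachineMemOperand built from this.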
16525 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16526 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 16527 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16528 Info.ptrVal = I.getArgOperand(0); 16529 Info.offset = 0; 16530 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 16531 Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); 16532 // volatile loads with NEON intrinsics not supported 16533 Info.flags = MachineMemOperand::MOLoad; 16534 return true; 16535 } 16536 case Intrinsic::arm_neon_vld1x2: 16537 case Intrinsic::arm_neon_vld1x3: 16538 case Intrinsic::arm_neon_vld1x4: { 16539 Info.opc = ISD::INTRINSIC_W_CHAIN; 16540 // Conservatively set memVT to the entire set of vectors loaded. 16541 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16542 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 16543 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16544 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 16545 Info.offset = 0; 16546 Info.align.reset(); 16547 // volatile loads with NEON intrinsics not supported 16548 Info.flags = MachineMemOperand::MOLoad; 16549 return true; 16550 } 16551 case Intrinsic::arm_neon_vst1: 16552 case Intrinsic::arm_neon_vst2: 16553 case Intrinsic::arm_neon_vst3: 16554 case Intrinsic::arm_neon_vst4: 16555 case Intrinsic::arm_neon_vst2lane: 16556 case Intrinsic::arm_neon_vst3lane: 16557 case Intrinsic::arm_neon_vst4lane: { 16558 Info.opc = ISD::INTRINSIC_VOID; 16559 // Conservatively set memVT to the entire set of vectors stored. 16560 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16561 unsigned NumElts = 0; 16562 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 16563 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 16564 if (!ArgTy->isVectorTy()) 16565 break; 16566 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 16567 } 16568 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16569 Info.ptrVal = I.getArgOperand(0); 16570 Info.offset = 0; 16571 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 16572 Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); 16573 // volatile stores with NEON intrinsics not supported 16574 Info.flags = MachineMemOperand::MOStore; 16575 return true; 16576 } 16577 case Intrinsic::arm_neon_vst1x2: 16578 case Intrinsic::arm_neon_vst1x3: 16579 case Intrinsic::arm_neon_vst1x4: { 16580 Info.opc = ISD::INTRINSIC_VOID; 16581 // Conservatively set memVT to the entire set of vectors stored. 
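    // As in the vstN cases above, the total width is summed over the vector
    // operands (operand 0 is the pointer) and recorded as a vector of i64
    // lanes.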
16582 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16583 unsigned NumElts = 0; 16584 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 16585 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 16586 if (!ArgTy->isVectorTy()) 16587 break; 16588 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 16589 } 16590 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16591 Info.ptrVal = I.getArgOperand(0); 16592 Info.offset = 0; 16593 Info.align.reset(); 16594 // volatile stores with NEON intrinsics not supported 16595 Info.flags = MachineMemOperand::MOStore; 16596 return true; 16597 } 16598 case Intrinsic::arm_ldaex: 16599 case Intrinsic::arm_ldrex: { 16600 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16601 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 16602 Info.opc = ISD::INTRINSIC_W_CHAIN; 16603 Info.memVT = MVT::getVT(PtrTy->getElementType()); 16604 Info.ptrVal = I.getArgOperand(0); 16605 Info.offset = 0; 16606 Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); 16607 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 16608 return true; 16609 } 16610 case Intrinsic::arm_stlex: 16611 case Intrinsic::arm_strex: { 16612 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16613 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 16614 Info.opc = ISD::INTRINSIC_W_CHAIN; 16615 Info.memVT = MVT::getVT(PtrTy->getElementType()); 16616 Info.ptrVal = I.getArgOperand(1); 16617 Info.offset = 0; 16618 Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); 16619 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 16620 return true; 16621 } 16622 case Intrinsic::arm_stlexd: 16623 case Intrinsic::arm_strexd: 16624 Info.opc = ISD::INTRINSIC_W_CHAIN; 16625 Info.memVT = MVT::i64; 16626 Info.ptrVal = I.getArgOperand(2); 16627 Info.offset = 0; 16628 Info.align = Align(8); 16629 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 16630 return true; 16631 16632 case Intrinsic::arm_ldaexd: 16633 case Intrinsic::arm_ldrexd: 16634 Info.opc = ISD::INTRINSIC_W_CHAIN; 16635 Info.memVT = MVT::i64; 16636 Info.ptrVal = I.getArgOperand(0); 16637 Info.offset = 0; 16638 Info.align = Align(8); 16639 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 16640 return true; 16641 16642 default: 16643 break; 16644 } 16645 16646 return false; 16647 } 16648 16649 /// Returns true if it is beneficial to convert a load of a constant 16650 /// to just the constant itself. 16651 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 16652 Type *Ty) const { 16653 assert(Ty->isIntegerTy()); 16654 16655 unsigned Bits = Ty->getPrimitiveSizeInBits(); 16656 if (Bits == 0 || Bits > 32) 16657 return false; 16658 return true; 16659 } 16660 16661 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 16662 unsigned Index) const { 16663 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 16664 return false; 16665 16666 return (Index == 0 || Index == ResVT.getVectorNumElements()); 16667 } 16668 16669 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 16670 ARM_MB::MemBOpt Domain) const { 16671 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16672 16673 // First, if the target has no DMB, see what fallback we can use. 
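  // The fallback below issues the CP15 "Data Memory Barrier" operation
  // (roughly mcr p15, 0, rN, c7, c10, 5), which serves as the DMB equivalent
  // on those ARMv6-class cores; the constants passed to Intrinsic::arm_mcr
  // encode that operation.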
16674 if (!Subtarget->hasDataBarrier()) { 16675 // Some ARMv6 cpus can support data barriers with an mcr instruction. 16676 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 16677 // here. 16678 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 16679 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 16680 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 16681 Builder.getInt32(0), Builder.getInt32(7), 16682 Builder.getInt32(10), Builder.getInt32(5)}; 16683 return Builder.CreateCall(MCR, args); 16684 } else { 16685 // Instead of using barriers, atomic accesses on these subtargets use 16686 // libcalls. 16687 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 16688 } 16689 } else { 16690 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 16691 // Only a full system barrier exists in the M-class architectures. 16692 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 16693 Constant *CDomain = Builder.getInt32(Domain); 16694 return Builder.CreateCall(DMB, CDomain); 16695 } 16696 } 16697 16698 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 16699 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 16700 Instruction *Inst, 16701 AtomicOrdering Ord) const { 16702 switch (Ord) { 16703 case AtomicOrdering::NotAtomic: 16704 case AtomicOrdering::Unordered: 16705 llvm_unreachable("Invalid fence: unordered/non-atomic"); 16706 case AtomicOrdering::Monotonic: 16707 case AtomicOrdering::Acquire: 16708 return nullptr; // Nothing to do 16709 case AtomicOrdering::SequentiallyConsistent: 16710 if (!Inst->hasAtomicStore()) 16711 return nullptr; // Nothing to do 16712 LLVM_FALLTHROUGH; 16713 case AtomicOrdering::Release: 16714 case AtomicOrdering::AcquireRelease: 16715 if (Subtarget->preferISHSTBarriers()) 16716 return makeDMB(Builder, ARM_MB::ISHST); 16717 // FIXME: add a comment with a link to documentation justifying this. 16718 else 16719 return makeDMB(Builder, ARM_MB::ISH); 16720 } 16721 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 16722 } 16723 16724 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 16725 Instruction *Inst, 16726 AtomicOrdering Ord) const { 16727 switch (Ord) { 16728 case AtomicOrdering::NotAtomic: 16729 case AtomicOrdering::Unordered: 16730 llvm_unreachable("Invalid fence: unordered/not-atomic"); 16731 case AtomicOrdering::Monotonic: 16732 case AtomicOrdering::Release: 16733 return nullptr; // Nothing to do 16734 case AtomicOrdering::Acquire: 16735 case AtomicOrdering::AcquireRelease: 16736 case AtomicOrdering::SequentiallyConsistent: 16737 return makeDMB(Builder, ARM_MB::ISH); 16738 } 16739 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 16740 } 16741 16742 // Loads and stores less than 64-bits are already atomic; ones above that 16743 // are doomed anyway, so defer to the default libcall and blame the OS when 16744 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 16745 // anything for those. 16746 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 16747 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 16748 return (Size == 64) && !Subtarget->isMClass(); 16749 } 16750 16751 // Loads and stores less than 64-bits are already atomic; ones above that 16752 // are doomed anyway, so defer to the default libcall and blame the OS when 16753 // things go wrong. 
Cortex M doesn't have ldrexd/strexd though, so don't emit 16754 // anything for those. 16755 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that 16756 // guarantee, see DDI0406C ARM architecture reference manual, 16757 // sections A8.8.72-74 LDRD) 16758 TargetLowering::AtomicExpansionKind 16759 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 16760 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 16761 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly 16762 : AtomicExpansionKind::None; 16763 } 16764 16765 // For the real atomic operations, we have ldrex/strex up to 32 bits, 16766 // and up to 64 bits on the non-M profiles 16767 TargetLowering::AtomicExpansionKind 16768 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 16769 if (AI->isFloatingPointOperation()) 16770 return AtomicExpansionKind::CmpXChg; 16771 16772 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 16773 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 16774 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) 16775 ? AtomicExpansionKind::LLSC 16776 : AtomicExpansionKind::None; 16777 } 16778 16779 TargetLowering::AtomicExpansionKind 16780 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { 16781 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 16782 // implement cmpxchg without spilling. If the address being exchanged is also 16783 // on the stack and close enough to the spill slot, this can lead to a 16784 // situation where the monitor always gets cleared and the atomic operation 16785 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 16786 bool HasAtomicCmpXchg = 16787 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 16788 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg) 16789 return AtomicExpansionKind::LLSC; 16790 return AtomicExpansionKind::None; 16791 } 16792 16793 bool ARMTargetLowering::shouldInsertFencesForAtomic( 16794 const Instruction *I) const { 16795 return InsertFencesForAtomic; 16796 } 16797 16798 // This has so far only been implemented for MachO. 16799 bool ARMTargetLowering::useLoadStackGuardNode() const { 16800 return Subtarget->isTargetMachO(); 16801 } 16802 16803 void ARMTargetLowering::insertSSPDeclarations(Module &M) const { 16804 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 16805 return TargetLowering::insertSSPDeclarations(M); 16806 16807 // MSVC CRT has a global variable holding security cookie. 16808 M.getOrInsertGlobal("__security_cookie", 16809 Type::getInt8PtrTy(M.getContext())); 16810 16811 // MSVC CRT has a function to validate security cookie. 16812 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 16813 "__security_check_cookie", Type::getVoidTy(M.getContext()), 16814 Type::getInt8PtrTy(M.getContext())); 16815 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) 16816 F->addAttribute(1, Attribute::AttrKind::InReg); 16817 } 16818 16819 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { 16820 // MSVC CRT has a global variable holding security cookie. 16821 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 16822 return M.getGlobalVariable("__security_cookie"); 16823 return TargetLowering::getSDagStackGuard(M); 16824 } 16825 16826 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { 16827 // MSVC CRT has a function to validate security cookie. 
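  // The declaration itself (with its pointer parameter marked inreg) is
  // created in insertSSPDeclarations; here we only look up the existing
  // function so the stack-protector code can call it.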
16828 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 16829 return M.getFunction("__security_check_cookie"); 16830 return TargetLowering::getSSPStackGuardCheck(M); 16831 } 16832 16833 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 16834 unsigned &Cost) const { 16835 // If we do not have NEON, vector types are not natively supported. 16836 if (!Subtarget->hasNEON()) 16837 return false; 16838 16839 // Floating point values and vector values map to the same register file. 16840 // Therefore, although we could do a store extract of a vector type, this is 16841 // better to leave at float as we have more freedom in the addressing mode for 16842 // those. 16843 if (VectorTy->isFPOrFPVectorTy()) 16844 return false; 16845 16846 // If the index is unknown at compile time, this is very expensive to lower 16847 // and it is not possible to combine the store with the extract. 16848 if (!isa<ConstantInt>(Idx)) 16849 return false; 16850 16851 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 16852 unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); 16853 // We can do a store + vector extract on any vector that fits perfectly in a D 16854 // or Q register. 16855 if (BitWidth == 64 || BitWidth == 128) { 16856 Cost = 0; 16857 return true; 16858 } 16859 return false; 16860 } 16861 16862 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 16863 return Subtarget->hasV6T2Ops(); 16864 } 16865 16866 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 16867 return Subtarget->hasV6T2Ops(); 16868 } 16869 16870 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { 16871 return !Subtarget->hasMinSize(); 16872 } 16873 16874 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 16875 AtomicOrdering Ord) const { 16876 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16877 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 16878 bool IsAcquire = isAcquireOrStronger(Ord); 16879 16880 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 16881 // intrinsic must return {i32, i32} and we have to recombine them into a 16882 // single i64 here. 16883 if (ValTy->getPrimitiveSizeInBits() == 64) { 16884 Intrinsic::ID Int = 16885 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 16886 Function *Ldrex = Intrinsic::getDeclaration(M, Int); 16887 16888 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 16889 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 16890 16891 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 16892 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 16893 if (!Subtarget->isLittle()) 16894 std::swap (Lo, Hi); 16895 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 16896 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 16897 return Builder.CreateOr( 16898 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); 16899 } 16900 16901 Type *Tys[] = { Addr->getType() }; 16902 Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 16903 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 16904 16905 return Builder.CreateTruncOrBitCast( 16906 Builder.CreateCall(Ldrex, Addr), 16907 cast<PointerType>(Addr->getType())->getElementType()); 16908 } 16909 16910 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 16911 IRBuilder<> &Builder) const { 16912 if (!Subtarget->hasV7Ops()) 16913 return; 16914 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16915 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 16916 } 16917 16918 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 16919 Value *Addr, 16920 AtomicOrdering Ord) const { 16921 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16922 bool IsRelease = isReleaseOrStronger(Ord); 16923 16924 // Since the intrinsics must have legal type, the i64 intrinsics take two 16925 // parameters: "i32, i32". We must marshal Val into the appropriate form 16926 // before the call. 16927 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 16928 Intrinsic::ID Int = 16929 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 16930 Function *Strex = Intrinsic::getDeclaration(M, Int); 16931 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 16932 16933 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 16934 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 16935 if (!Subtarget->isLittle()) 16936 std::swap(Lo, Hi); 16937 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 16938 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 16939 } 16940 16941 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; 16942 Type *Tys[] = { Addr->getType() }; 16943 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 16944 16945 return Builder.CreateCall( 16946 Strex, {Builder.CreateZExtOrBitCast( 16947 Val, Strex->getFunctionType()->getParamType(0)), 16948 Addr}); 16949 } 16950 16951 16952 bool ARMTargetLowering::alignLoopsWithOptSize() const { 16953 return Subtarget->isMClass(); 16954 } 16955 16956 /// A helper function for determining the number of interleaved accesses we 16957 /// will generate when lowering accesses of the given type. 16958 unsigned 16959 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 16960 const DataLayout &DL) const { 16961 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 16962 } 16963 16964 bool ARMTargetLowering::isLegalInterleavedAccessType( 16965 unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { 16966 16967 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 16968 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 16969 16970 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) 16971 return false; 16972 16973 // Ensure the vector doesn't have f16 elements. Even though we could do an 16974 // i16 vldN, we can't hold the f16 vectors and will end up converting via 16975 // f32. 16976 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) 16977 return false; 16978 if (Subtarget->hasMVEIntegerOps() && Factor == 3) 16979 return false; 16980 16981 // Ensure the number of vector elements is greater than 1. 16982 if (VecTy->getNumElements() < 2) 16983 return false; 16984 16985 // Ensure the element type is legal. 16986 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 16987 return false; 16988 16989 // Ensure the total vector size is 64 or a multiple of 128. 
Types larger than 16990 // 128 will be split into multiple interleaved accesses. 16991 if (Subtarget->hasNEON() && VecSize == 64) 16992 return true; 16993 return VecSize % 128 == 0; 16994 } 16995 16996 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { 16997 if (Subtarget->hasNEON()) 16998 return 4; 16999 if (Subtarget->hasMVEIntegerOps()) 17000 return MVEMaxSupportedInterleaveFactor; 17001 return TargetLoweringBase::getMaxSupportedInterleaveFactor(); 17002 } 17003 17004 /// Lower an interleaved load into a vldN intrinsic. 17005 /// 17006 /// E.g. Lower an interleaved load (Factor = 2): 17007 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 17008 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 17009 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 17010 /// 17011 /// Into: 17012 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 17013 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 17014 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 17015 bool ARMTargetLowering::lowerInterleavedLoad( 17016 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 17017 ArrayRef<unsigned> Indices, unsigned Factor) const { 17018 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 17019 "Invalid interleave factor"); 17020 assert(!Shuffles.empty() && "Empty shufflevector input"); 17021 assert(Shuffles.size() == Indices.size() && 17022 "Unmatched number of shufflevectors and indices"); 17023 17024 VectorType *VecTy = Shuffles[0]->getType(); 17025 Type *EltTy = VecTy->getVectorElementType(); 17026 17027 const DataLayout &DL = LI->getModule()->getDataLayout(); 17028 17029 // Skip if we do not have NEON and skip illegal vector types. We can 17030 // "legalize" wide vector types into multiple interleaved accesses as long as 17031 // the vector types are divisible by 128. 17032 if (!isLegalInterleavedAccessType(Factor, VecTy, DL)) 17033 return false; 17034 17035 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 17036 17037 // A pointer vector can not be the return type of the ldN intrinsics. Need to 17038 // load integer vectors first and then convert to pointer vectors. 17039 if (EltTy->isPointerTy()) 17040 VecTy = 17041 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 17042 17043 IRBuilder<> Builder(LI); 17044 17045 // The base address of the load. 17046 Value *BaseAddr = LI->getPointerOperand(); 17047 17048 if (NumLoads > 1) { 17049 // If we're going to generate more than one load, reset the sub-vector type 17050 // to something legal. 17051 VecTy = VectorType::get(VecTy->getVectorElementType(), 17052 VecTy->getVectorNumElements() / NumLoads); 17053 17054 // We will compute the pointer operand of each load from the original base 17055 // address using GEPs. Cast the base address to a pointer to the scalar 17056 // element type. 
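    // For example, with Factor == 2 and <8 x i32> sub-vectors, each sub-vector
    // is 256 bits, so NumLoads is 2: VecTy is narrowed to <4 x i32> and two
    // vld2 calls are emitted, the second one offset by 4 * 2 elements from the
    // original base.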
17057 BaseAddr = Builder.CreateBitCast( 17058 BaseAddr, VecTy->getVectorElementType()->getPointerTo( 17059 LI->getPointerAddressSpace())); 17060 } 17061 17062 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); 17063 17064 auto createLoadIntrinsic = [&](Value *BaseAddr) { 17065 if (Subtarget->hasNEON()) { 17066 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 17067 Type *Tys[] = {VecTy, Int8Ptr}; 17068 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 17069 Intrinsic::arm_neon_vld3, 17070 Intrinsic::arm_neon_vld4}; 17071 Function *VldnFunc = 17072 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 17073 17074 SmallVector<Value *, 2> Ops; 17075 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 17076 Ops.push_back(Builder.getInt32(LI->getAlignment())); 17077 17078 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 17079 } else { 17080 assert((Factor == 2 || Factor == 4) && 17081 "expected interleave factor of 2 or 4 for MVE"); 17082 Intrinsic::ID LoadInts = 17083 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; 17084 Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( 17085 LI->getPointerAddressSpace()); 17086 Type *Tys[] = {VecTy, VecEltTy}; 17087 Function *VldnFunc = 17088 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); 17089 17090 SmallVector<Value *, 2> Ops; 17091 Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); 17092 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 17093 } 17094 }; 17095 17096 // Holds sub-vectors extracted from the load intrinsic return values. The 17097 // sub-vectors are associated with the shufflevector instructions they will 17098 // replace. 17099 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 17100 17101 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 17102 // If we're generating more than one load, compute the base address of 17103 // subsequent loads as an offset from the previous. 17104 if (LoadCount > 0) 17105 BaseAddr = 17106 Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, 17107 VecTy->getVectorNumElements() * Factor); 17108 17109 CallInst *VldN = createLoadIntrinsic(BaseAddr); 17110 17111 // Replace uses of each shufflevector with the corresponding vector loaded 17112 // by ldN. 17113 for (unsigned i = 0; i < Shuffles.size(); i++) { 17114 ShuffleVectorInst *SV = Shuffles[i]; 17115 unsigned Index = Indices[i]; 17116 17117 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 17118 17119 // Convert the integer vector to pointer vector if the element is pointer. 17120 if (EltTy->isPointerTy()) 17121 SubVec = Builder.CreateIntToPtr( 17122 SubVec, VectorType::get(SV->getType()->getVectorElementType(), 17123 VecTy->getVectorNumElements())); 17124 17125 SubVecs[SV].push_back(SubVec); 17126 } 17127 } 17128 17129 // Replace uses of the shufflevector instructions with the sub-vectors 17130 // returned by the load intrinsic. If a shufflevector instruction is 17131 // associated with more than one sub-vector, those sub-vectors will be 17132 // concatenated into a single wide vector. 17133 for (ShuffleVectorInst *SVI : Shuffles) { 17134 auto &SubVec = SubVecs[SVI]; 17135 auto *WideVec = 17136 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 17137 SVI->replaceAllUsesWith(WideVec); 17138 } 17139 17140 return true; 17141 } 17142 17143 /// Lower an interleaved store into a vstN intrinsic. 17144 /// 17145 /// E.g. 
Lower an interleaved store (Factor = 3): 17146 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 17147 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 17148 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 17149 /// 17150 /// Into: 17151 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 17152 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 17153 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 17154 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 17155 /// 17156 /// Note that the new shufflevectors will be removed and we'll only generate one 17157 /// vst3 instruction in CodeGen. 17158 /// 17159 /// Example for a more general valid mask (Factor 3). Lower: 17160 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 17161 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 17162 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 17163 /// 17164 /// Into: 17165 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 17166 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 17167 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 17168 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 17169 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 17170 ShuffleVectorInst *SVI, 17171 unsigned Factor) const { 17172 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 17173 "Invalid interleave factor"); 17174 17175 VectorType *VecTy = SVI->getType(); 17176 assert(VecTy->getVectorNumElements() % Factor == 0 && 17177 "Invalid interleaved store"); 17178 17179 unsigned LaneLen = VecTy->getVectorNumElements() / Factor; 17180 Type *EltTy = VecTy->getVectorElementType(); 17181 VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); 17182 17183 const DataLayout &DL = SI->getModule()->getDataLayout(); 17184 17185 // Skip if we do not have NEON and skip illegal vector types. We can 17186 // "legalize" wide vector types into multiple interleaved accesses as long as 17187 // the vector types are divisible by 128. 17188 if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) 17189 return false; 17190 17191 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 17192 17193 Value *Op0 = SVI->getOperand(0); 17194 Value *Op1 = SVI->getOperand(1); 17195 IRBuilder<> Builder(SI); 17196 17197 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 17198 // vectors to integer vectors. 17199 if (EltTy->isPointerTy()) { 17200 Type *IntTy = DL.getIntPtrType(EltTy); 17201 17202 // Convert to the corresponding integer vector. 17203 Type *IntVecTy = 17204 VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); 17205 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 17206 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 17207 17208 SubVecTy = VectorType::get(IntTy, LaneLen); 17209 } 17210 17211 // The base address of the store. 17212 Value *BaseAddr = SI->getPointerOperand(); 17213 17214 if (NumStores > 1) { 17215 // If we're going to generate more than one store, reset the lane length 17216 // and sub-vector type to something legal. 17217 LaneLen /= NumStores; 17218 SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); 17219 17220 // We will compute the pointer operand of each store from the original base 17221 // address using GEPs. Cast the base address to a pointer to the scalar 17222 // element type. 
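    // Mirroring the load path above: an interleaved store too wide for a
    // single vstN is split into NumStores pieces, and each iteration of the
    // loop further down advances the base pointer by LaneLen * Factor scalar
    // elements.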
17223 BaseAddr = Builder.CreateBitCast( 17224 BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( 17225 SI->getPointerAddressSpace())); 17226 } 17227 17228 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); 17229 17230 auto Mask = SVI->getShuffleMask(); 17231 17232 auto createStoreIntrinsic = [&](Value *BaseAddr, 17233 SmallVectorImpl<Value *> &Shuffles) { 17234 if (Subtarget->hasNEON()) { 17235 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 17236 Intrinsic::arm_neon_vst3, 17237 Intrinsic::arm_neon_vst4}; 17238 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 17239 Type *Tys[] = {Int8Ptr, SubVecTy}; 17240 17241 Function *VstNFunc = Intrinsic::getDeclaration( 17242 SI->getModule(), StoreInts[Factor - 2], Tys); 17243 17244 SmallVector<Value *, 6> Ops; 17245 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 17246 for (auto S : Shuffles) 17247 Ops.push_back(S); 17248 Ops.push_back(Builder.getInt32(SI->getAlignment())); 17249 Builder.CreateCall(VstNFunc, Ops); 17250 } else { 17251 assert((Factor == 2 || Factor == 4) && 17252 "expected interleave factor of 2 or 4 for MVE"); 17253 Intrinsic::ID StoreInts = 17254 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; 17255 Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( 17256 SI->getPointerAddressSpace()); 17257 Type *Tys[] = {EltPtrTy, SubVecTy}; 17258 Function *VstNFunc = 17259 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); 17260 17261 SmallVector<Value *, 6> Ops; 17262 Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); 17263 for (auto S : Shuffles) 17264 Ops.push_back(S); 17265 for (unsigned F = 0; F < Factor; F++) { 17266 Ops.push_back(Builder.getInt32(F)); 17267 Builder.CreateCall(VstNFunc, Ops); 17268 Ops.pop_back(); 17269 } 17270 } 17271 }; 17272 17273 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 17274 // If we're generating more than one store, we compute the base address of 17275 // subsequent stores as an offset from the previous. 17276 if (StoreCount > 0) 17277 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), 17278 BaseAddr, LaneLen * Factor); 17279 17280 SmallVector<Value *, 4> Shuffles; 17281 17282 // Split the shufflevector operands into sub-vectors for the new vstN call. 17283 for (unsigned i = 0; i < Factor; i++) { 17284 unsigned IdxI = StoreCount * LaneLen * Factor + i; 17285 if (Mask[IdxI] >= 0) { 17286 Shuffles.push_back(Builder.CreateShuffleVector( 17287 Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); 17288 } else { 17289 unsigned StartMask = 0; 17290 for (unsigned j = 1; j < LaneLen; j++) { 17291 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 17292 if (Mask[IdxJ * Factor + IdxI] >= 0) { 17293 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 17294 break; 17295 } 17296 } 17297 // Note: If all elements in a chunk are undefs, StartMask=0! 17298 // Note: Filling undef gaps with random elements is ok, since 17299 // those elements were being written anyway (with undefs).
17300 // In the case of all undefs we're defaulting to using elems from 0 17301 // Note: StartMask cannot be negative, it's checked in 17302 // isReInterleaveMask 17303 Shuffles.push_back(Builder.CreateShuffleVector( 17304 Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); 17305 } 17306 } 17307 17308 createStoreIntrinsic(BaseAddr, Shuffles); 17309 } 17310 return true; 17311 } 17312 17313 enum HABaseType { 17314 HA_UNKNOWN = 0, 17315 HA_FLOAT, 17316 HA_DOUBLE, 17317 HA_VECT64, 17318 HA_VECT128 17319 }; 17320 17321 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 17322 uint64_t &Members) { 17323 if (auto *ST = dyn_cast<StructType>(Ty)) { 17324 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 17325 uint64_t SubMembers = 0; 17326 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 17327 return false; 17328 Members += SubMembers; 17329 } 17330 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 17331 uint64_t SubMembers = 0; 17332 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 17333 return false; 17334 Members += SubMembers * AT->getNumElements(); 17335 } else if (Ty->isFloatTy()) { 17336 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 17337 return false; 17338 Members = 1; 17339 Base = HA_FLOAT; 17340 } else if (Ty->isDoubleTy()) { 17341 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 17342 return false; 17343 Members = 1; 17344 Base = HA_DOUBLE; 17345 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 17346 Members = 1; 17347 switch (Base) { 17348 case HA_FLOAT: 17349 case HA_DOUBLE: 17350 return false; 17351 case HA_VECT64: 17352 return VT->getBitWidth() == 64; 17353 case HA_VECT128: 17354 return VT->getBitWidth() == 128; 17355 case HA_UNKNOWN: 17356 switch (VT->getBitWidth()) { 17357 case 64: 17358 Base = HA_VECT64; 17359 return true; 17360 case 128: 17361 Base = HA_VECT128; 17362 return true; 17363 default: 17364 return false; 17365 } 17366 } 17367 } 17368 17369 return (Members > 0 && Members <= 4); 17370 } 17371 17372 /// Return the correct alignment for the current calling convention. 17373 Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, 17374 DataLayout DL) const { 17375 const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); 17376 if (!ArgTy->isVectorTy()) 17377 return ABITypeAlign; 17378 17379 // Avoid over-aligning vector parameters. It would require realigning the 17380 // stack and waste space for no real benefit. 17381 return std::min(ABITypeAlign, DL.getStackAlignment()); 17382 } 17383 17384 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 17385 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 17386 /// passing according to AAPCS rules. 17387 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 17388 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 17389 if (getEffectiveCallingConv(CallConv, isVarArg) != 17390 CallingConv::ARM_AAPCS_VFP) 17391 return false; 17392 17393 HABaseType Base = HA_UNKNOWN; 17394 uint64_t Members = 0; 17395 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 17396 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 17397 17398 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 17399 return IsHA || IsIntArray; 17400 } 17401 17402 unsigned ARMTargetLowering::getExceptionPointerRegister( 17403 const Constant *PersonalityFn) const { 17404 // Platforms which do not use SjLj EH may return values in these registers 17405 // via the personality function. 
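  // For the non-SjLj personalities the exception pointer is expected in R0 and
  // the selector in R1 (see getExceptionSelectorRegister below); SjLj EH has
  // no such fixed registers, hence ARM::NoRegister.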
17406 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; 17407 } 17408 17409 unsigned ARMTargetLowering::getExceptionSelectorRegister( 17410 const Constant *PersonalityFn) const { 17411 // Platforms which do not use SjLj EH may return values in these registers 17412 // via the personality function. 17413 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; 17414 } 17415 17416 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 17417 // Update IsSplitCSR in ARMFunctionInfo. 17418 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 17419 AFI->setIsSplitCSR(true); 17420 } 17421 17422 void ARMTargetLowering::insertCopiesSplitCSR( 17423 MachineBasicBlock *Entry, 17424 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 17425 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 17426 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 17427 if (!IStart) 17428 return; 17429 17430 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 17431 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 17432 MachineBasicBlock::iterator MBBI = Entry->begin(); 17433 for (const MCPhysReg *I = IStart; *I; ++I) { 17434 const TargetRegisterClass *RC = nullptr; 17435 if (ARM::GPRRegClass.contains(*I)) 17436 RC = &ARM::GPRRegClass; 17437 else if (ARM::DPRRegClass.contains(*I)) 17438 RC = &ARM::DPRRegClass; 17439 else 17440 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 17441 17442 Register NewVR = MRI->createVirtualRegister(RC); 17443 // Create copy from CSR to a virtual register. 17444 // FIXME: this currently does not emit CFI pseudo-instructions, it works 17445 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 17446 // nounwind. If we want to generalize this later, we may need to emit 17447 // CFI pseudo-instructions. 17448 assert(Entry->getParent()->getFunction().hasFnAttribute( 17449 Attribute::NoUnwind) && 17450 "Function should be nounwind in insertCopiesSplitCSR!"); 17451 Entry->addLiveIn(*I); 17452 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 17453 .addReg(*I); 17454 17455 // Insert the copy-back instructions right before the terminator. 17456 for (auto *Exit : Exits) 17457 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 17458 TII->get(TargetOpcode::COPY), *I) 17459 .addReg(NewVR); 17460 } 17461 } 17462 17463 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { 17464 MF.getFrameInfo().computeMaxCallFrameSize(MF); 17465 TargetLoweringBase::finalizeLowering(MF); 17466 } 17467