1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that ARM uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "ARMISelLowering.h" 15 #include "ARMBaseInstrInfo.h" 16 #include "ARMBaseRegisterInfo.h" 17 #include "ARMCallingConv.h" 18 #include "ARMConstantPoolValue.h" 19 #include "ARMMachineFunctionInfo.h" 20 #include "ARMPerfectShuffle.h" 21 #include "ARMRegisterInfo.h" 22 #include "ARMSelectionDAGInfo.h" 23 #include "ARMSubtarget.h" 24 #include "MCTargetDesc/ARMAddressingModes.h" 25 #include "MCTargetDesc/ARMBaseInfo.h" 26 #include "Utils/ARMBaseInfo.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/APInt.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/BitVector.h" 31 #include "llvm/ADT/DenseMap.h" 32 #include "llvm/ADT/STLExtras.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringExtras.h" 37 #include "llvm/ADT/StringRef.h" 38 #include "llvm/ADT/StringSwitch.h" 39 #include "llvm/ADT/Triple.h" 40 #include "llvm/ADT/Twine.h" 41 #include "llvm/Analysis/VectorUtils.h" 42 #include "llvm/CodeGen/CallingConvLower.h" 43 #include "llvm/CodeGen/ISDOpcodes.h" 44 #include "llvm/CodeGen/IntrinsicLowering.h" 45 #include "llvm/CodeGen/MachineBasicBlock.h" 46 #include "llvm/CodeGen/MachineConstantPool.h" 47 #include "llvm/CodeGen/MachineFrameInfo.h" 48 #include "llvm/CodeGen/MachineFunction.h" 49 #include "llvm/CodeGen/MachineInstr.h" 50 #include "llvm/CodeGen/MachineInstrBuilder.h" 51 #include 
"llvm/CodeGen/MachineJumpTableInfo.h" 52 #include "llvm/CodeGen/MachineMemOperand.h" 53 #include "llvm/CodeGen/MachineOperand.h" 54 #include "llvm/CodeGen/MachineRegisterInfo.h" 55 #include "llvm/CodeGen/RuntimeLibcalls.h" 56 #include "llvm/CodeGen/SelectionDAG.h" 57 #include "llvm/CodeGen/SelectionDAGNodes.h" 58 #include "llvm/CodeGen/TargetInstrInfo.h" 59 #include "llvm/CodeGen/TargetLowering.h" 60 #include "llvm/CodeGen/TargetOpcodes.h" 61 #include "llvm/CodeGen/TargetRegisterInfo.h" 62 #include "llvm/CodeGen/TargetSubtargetInfo.h" 63 #include "llvm/CodeGen/ValueTypes.h" 64 #include "llvm/IR/Attributes.h" 65 #include "llvm/IR/CallingConv.h" 66 #include "llvm/IR/Constant.h" 67 #include "llvm/IR/Constants.h" 68 #include "llvm/IR/DataLayout.h" 69 #include "llvm/IR/DebugLoc.h" 70 #include "llvm/IR/DerivedTypes.h" 71 #include "llvm/IR/Function.h" 72 #include "llvm/IR/GlobalAlias.h" 73 #include "llvm/IR/GlobalValue.h" 74 #include "llvm/IR/GlobalVariable.h" 75 #include "llvm/IR/IRBuilder.h" 76 #include "llvm/IR/InlineAsm.h" 77 #include "llvm/IR/Instruction.h" 78 #include "llvm/IR/Instructions.h" 79 #include "llvm/IR/IntrinsicInst.h" 80 #include "llvm/IR/Intrinsics.h" 81 #include "llvm/IR/IntrinsicsARM.h" 82 #include "llvm/IR/Module.h" 83 #include "llvm/IR/PatternMatch.h" 84 #include "llvm/IR/Type.h" 85 #include "llvm/IR/User.h" 86 #include "llvm/IR/Value.h" 87 #include "llvm/MC/MCInstrDesc.h" 88 #include "llvm/MC/MCInstrItineraries.h" 89 #include "llvm/MC/MCRegisterInfo.h" 90 #include "llvm/MC/MCSchedule.h" 91 #include "llvm/Support/AtomicOrdering.h" 92 #include "llvm/Support/BranchProbability.h" 93 #include "llvm/Support/Casting.h" 94 #include "llvm/Support/CodeGen.h" 95 #include "llvm/Support/CommandLine.h" 96 #include "llvm/Support/Compiler.h" 97 #include "llvm/Support/Debug.h" 98 #include "llvm/Support/ErrorHandling.h" 99 #include "llvm/Support/KnownBits.h" 100 #include "llvm/Support/MachineValueType.h" 101 #include "llvm/Support/MathExtras.h" 102 #include 
"llvm/Support/raw_ostream.h" 103 #include "llvm/Target/TargetMachine.h" 104 #include "llvm/Target/TargetOptions.h" 105 #include <algorithm> 106 #include <cassert> 107 #include <cstdint> 108 #include <cstdlib> 109 #include <iterator> 110 #include <limits> 111 #include <string> 112 #include <tuple> 113 #include <utility> 114 #include <vector> 115 116 using namespace llvm; 117 using namespace llvm::PatternMatch; 118 119 #define DEBUG_TYPE "arm-isel" 120 121 STATISTIC(NumTailCalls, "Number of tail calls"); 122 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 123 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 124 STATISTIC(NumConstpoolPromoted, 125 "Number of constants with their storage promoted into constant pools"); 126 127 static cl::opt<bool> 128 ARMInterworking("arm-interworking", cl::Hidden, 129 cl::desc("Enable / disable ARM interworking (for debugging only)"), 130 cl::init(true)); 131 132 static cl::opt<bool> EnableConstpoolPromotion( 133 "arm-promote-constant", cl::Hidden, 134 cl::desc("Enable / disable promotion of unnamed_addr constants into " 135 "constant pools"), 136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 137 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 138 "arm-promote-constant-max-size", cl::Hidden, 139 cl::desc("Maximum size of constant to promote into a constant pool"), 140 cl::init(64)); 141 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 142 "arm-promote-constant-max-total", cl::Hidden, 143 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 144 cl::init(128)); 145 146 static cl::opt<unsigned> 147 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 148 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 149 cl::init(2)); 150 151 // The APCS parameter registers. 
152 static const MCPhysReg GPRArgRegs[] = { 153 ARM::R0, ARM::R1, ARM::R2, ARM::R3 154 }; 155 156 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, 157 MVT PromotedBitwiseVT) { 158 if (VT != PromotedLdStVT) { 159 setOperationAction(ISD::LOAD, VT, Promote); 160 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 161 162 setOperationAction(ISD::STORE, VT, Promote); 163 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 164 } 165 166 MVT ElemTy = VT.getVectorElementType(); 167 if (ElemTy != MVT::f64) 168 setOperationAction(ISD::SETCC, VT, Custom); 169 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 170 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 171 if (ElemTy == MVT::i32) { 172 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 173 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 174 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 175 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 176 } else { 177 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 178 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 179 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 180 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 181 } 182 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 183 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 184 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 185 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 186 setOperationAction(ISD::SELECT, VT, Expand); 187 setOperationAction(ISD::SELECT_CC, VT, Expand); 188 setOperationAction(ISD::VSELECT, VT, Expand); 189 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 190 if (VT.isInteger()) { 191 setOperationAction(ISD::SHL, VT, Custom); 192 setOperationAction(ISD::SRA, VT, Custom); 193 setOperationAction(ISD::SRL, VT, Custom); 194 } 195 196 // Promote all bit-wise operations. 
197 if (VT.isInteger() && VT != PromotedBitwiseVT) { 198 setOperationAction(ISD::AND, VT, Promote); 199 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); 200 setOperationAction(ISD::OR, VT, Promote); 201 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); 202 setOperationAction(ISD::XOR, VT, Promote); 203 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); 204 } 205 206 // Neon does not support vector divide/remainder operations. 207 setOperationAction(ISD::SDIV, VT, Expand); 208 setOperationAction(ISD::UDIV, VT, Expand); 209 setOperationAction(ISD::FDIV, VT, Expand); 210 setOperationAction(ISD::SREM, VT, Expand); 211 setOperationAction(ISD::UREM, VT, Expand); 212 setOperationAction(ISD::FREM, VT, Expand); 213 214 if (!VT.isFloatingPoint() && 215 VT != MVT::v2i64 && VT != MVT::v1i64) 216 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 217 setOperationAction(Opcode, VT, Legal); 218 if (!VT.isFloatingPoint()) 219 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) 220 setOperationAction(Opcode, VT, Legal); 221 } 222 223 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 224 addRegisterClass(VT, &ARM::DPRRegClass); 225 addTypeForNEON(VT, MVT::f64, MVT::v2i32); 226 } 227 228 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 229 addRegisterClass(VT, &ARM::DPairRegClass); 230 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); 231 } 232 233 void ARMTargetLowering::setAllExpand(MVT VT) { 234 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) 235 setOperationAction(Opc, VT, Expand); 236 237 // We support these really simple operations even on types where all 238 // the actual arithmetic has to be broken down into simpler 239 // operations or turned into library calls. 
240 setOperationAction(ISD::BITCAST, VT, Legal); 241 setOperationAction(ISD::LOAD, VT, Legal); 242 setOperationAction(ISD::STORE, VT, Legal); 243 setOperationAction(ISD::UNDEF, VT, Legal); 244 } 245 246 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, 247 LegalizeAction Action) { 248 setLoadExtAction(ISD::EXTLOAD, From, To, Action); 249 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); 250 setLoadExtAction(ISD::SEXTLOAD, From, To, Action); 251 } 252 253 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { 254 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; 255 256 for (auto VT : IntTypes) { 257 addRegisterClass(VT, &ARM::MQPRRegClass); 258 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 259 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 260 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 261 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 262 setOperationAction(ISD::SHL, VT, Custom); 263 setOperationAction(ISD::SRA, VT, Custom); 264 setOperationAction(ISD::SRL, VT, Custom); 265 setOperationAction(ISD::SMIN, VT, Legal); 266 setOperationAction(ISD::SMAX, VT, Legal); 267 setOperationAction(ISD::UMIN, VT, Legal); 268 setOperationAction(ISD::UMAX, VT, Legal); 269 setOperationAction(ISD::ABS, VT, Legal); 270 setOperationAction(ISD::SETCC, VT, Custom); 271 setOperationAction(ISD::MLOAD, VT, Custom); 272 setOperationAction(ISD::MSTORE, VT, Legal); 273 setOperationAction(ISD::CTLZ, VT, Legal); 274 setOperationAction(ISD::CTTZ, VT, Custom); 275 setOperationAction(ISD::BITREVERSE, VT, Legal); 276 setOperationAction(ISD::BSWAP, VT, Legal); 277 setOperationAction(ISD::SADDSAT, VT, Legal); 278 setOperationAction(ISD::UADDSAT, VT, Legal); 279 setOperationAction(ISD::SSUBSAT, VT, Legal); 280 setOperationAction(ISD::USUBSAT, VT, Legal); 281 282 // No native support for these. 
283 setOperationAction(ISD::UDIV, VT, Expand); 284 setOperationAction(ISD::SDIV, VT, Expand); 285 setOperationAction(ISD::UREM, VT, Expand); 286 setOperationAction(ISD::SREM, VT, Expand); 287 setOperationAction(ISD::CTPOP, VT, Expand); 288 289 // Vector reductions 290 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); 291 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); 292 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); 293 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); 294 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); 295 296 if (!HasMVEFP) { 297 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 298 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 299 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 300 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 301 } 302 303 // Pre and Post inc are supported on loads and stores 304 for (unsigned im = (unsigned)ISD::PRE_INC; 305 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 306 setIndexedLoadAction(im, VT, Legal); 307 setIndexedStoreAction(im, VT, Legal); 308 setIndexedMaskedLoadAction(im, VT, Legal); 309 setIndexedMaskedStoreAction(im, VT, Legal); 310 } 311 } 312 313 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; 314 for (auto VT : FloatTypes) { 315 addRegisterClass(VT, &ARM::MQPRRegClass); 316 if (!HasMVEFP) 317 setAllExpand(VT); 318 319 // These are legal or custom whether we have MVE.fp or not 320 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 321 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 322 setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom); 323 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 324 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 325 setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); 326 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); 327 setOperationAction(ISD::SETCC, VT, Custom); 328 setOperationAction(ISD::MLOAD, VT, Custom); 329 setOperationAction(ISD::MSTORE, VT, Legal); 330 331 // Pre 
and Post inc are supported on loads and stores 332 for (unsigned im = (unsigned)ISD::PRE_INC; 333 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 334 setIndexedLoadAction(im, VT, Legal); 335 setIndexedStoreAction(im, VT, Legal); 336 setIndexedMaskedLoadAction(im, VT, Legal); 337 setIndexedMaskedStoreAction(im, VT, Legal); 338 } 339 340 if (HasMVEFP) { 341 setOperationAction(ISD::FMINNUM, VT, Legal); 342 setOperationAction(ISD::FMAXNUM, VT, Legal); 343 setOperationAction(ISD::FROUND, VT, Legal); 344 345 // No native support for these. 346 setOperationAction(ISD::FDIV, VT, Expand); 347 setOperationAction(ISD::FREM, VT, Expand); 348 setOperationAction(ISD::FSQRT, VT, Expand); 349 setOperationAction(ISD::FSIN, VT, Expand); 350 setOperationAction(ISD::FCOS, VT, Expand); 351 setOperationAction(ISD::FPOW, VT, Expand); 352 setOperationAction(ISD::FLOG, VT, Expand); 353 setOperationAction(ISD::FLOG2, VT, Expand); 354 setOperationAction(ISD::FLOG10, VT, Expand); 355 setOperationAction(ISD::FEXP, VT, Expand); 356 setOperationAction(ISD::FEXP2, VT, Expand); 357 setOperationAction(ISD::FNEARBYINT, VT, Expand); 358 } 359 } 360 361 // We 'support' these types up to bitcast/load/store level, regardless of 362 // MVE integer-only / float support. Only doing FP data processing on the FP 363 // vector types is inhibited at integer-only level. 364 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; 365 for (auto VT : LongTypes) { 366 addRegisterClass(VT, &ARM::MQPRRegClass); 367 setAllExpand(VT); 368 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 369 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 370 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 371 } 372 // We can do bitwise operations on v2i64 vectors 373 setOperationAction(ISD::AND, MVT::v2i64, Legal); 374 setOperationAction(ISD::OR, MVT::v2i64, Legal); 375 setOperationAction(ISD::XOR, MVT::v2i64, Legal); 376 377 // It is legal to extload from v4i8 to v4i16 or v4i32. 
378 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); 379 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); 380 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); 381 382 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16. 383 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); 384 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); 385 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); 386 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal); 387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal); 388 389 // Some truncating stores are legal too. 390 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); 391 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); 392 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 393 394 // Pre and Post inc on these are legal, given the correct extends 395 for (unsigned im = (unsigned)ISD::PRE_INC; 396 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 397 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { 398 setIndexedLoadAction(im, VT, Legal); 399 setIndexedStoreAction(im, VT, Legal); 400 setIndexedMaskedLoadAction(im, VT, Legal); 401 setIndexedMaskedStoreAction(im, VT, Legal); 402 } 403 } 404 405 // Predicate types 406 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; 407 for (auto VT : pTypes) { 408 addRegisterClass(VT, &ARM::VCCRRegClass); 409 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 410 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 411 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 412 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 413 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 414 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 415 setOperationAction(ISD::SETCC, VT, Custom); 416 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); 417 setOperationAction(ISD::LOAD, VT, Custom); 418 setOperationAction(ISD::STORE, VT, Custom); 419 } 420 } 421 422 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 
423 const ARMSubtarget &STI) 424 : TargetLowering(TM), Subtarget(&STI) { 425 RegInfo = Subtarget->getRegisterInfo(); 426 Itins = Subtarget->getInstrItineraryData(); 427 428 setBooleanContents(ZeroOrOneBooleanContent); 429 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 430 431 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 432 !Subtarget->isTargetWatchOS()) { 433 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 434 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 435 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 436 IsHFTarget ? CallingConv::ARM_AAPCS_VFP 437 : CallingConv::ARM_AAPCS); 438 } 439 440 if (Subtarget->isTargetMachO()) { 441 // Uses VFP for Thumb libfuncs if available. 442 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 443 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 444 static const struct { 445 const RTLIB::Libcall Op; 446 const char * const Name; 447 const ISD::CondCode Cond; 448 } LibraryCalls[] = { 449 // Single-precision floating-point arithmetic. 450 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 451 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 452 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 453 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 454 455 // Double-precision floating-point arithmetic. 456 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 457 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 458 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 459 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 460 461 // Single-precision comparisons. 
462 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 463 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 464 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 465 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 466 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 467 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 468 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 469 470 // Double-precision comparisons. 471 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 472 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 473 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 474 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 475 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 476 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 477 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 478 479 // Floating-point to integer conversions. 480 // i64 conversions are done via library routines even when generating VFP 481 // instructions, so use the same ones. 482 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 483 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 484 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 485 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 486 487 // Conversions between floating types. 488 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 489 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 490 491 // Integer to floating-point conversions. 492 // i64 conversions are done via library routines even when generating VFP 493 // instructions, so use the same ones. 494 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 495 // e.g., __floatunsidf vs. __floatunssidfvfp. 
496 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 497 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 498 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 499 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 500 }; 501 502 for (const auto &LC : LibraryCalls) { 503 setLibcallName(LC.Op, LC.Name); 504 if (LC.Cond != ISD::SETCC_INVALID) 505 setCmpLibcallCC(LC.Op, LC.Cond); 506 } 507 } 508 } 509 510 // These libcalls are not available in 32-bit. 511 setLibcallName(RTLIB::SHL_I128, nullptr); 512 setLibcallName(RTLIB::SRL_I128, nullptr); 513 setLibcallName(RTLIB::SRA_I128, nullptr); 514 515 // RTLIB 516 if (Subtarget->isAAPCS_ABI() && 517 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 518 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 519 static const struct { 520 const RTLIB::Libcall Op; 521 const char * const Name; 522 const CallingConv::ID CC; 523 const ISD::CondCode Cond; 524 } LibraryCalls[] = { 525 // Double-precision floating-point arithmetic helper functions 526 // RTABI chapter 4.1.2, Table 2 527 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 528 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 529 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 530 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 531 532 // Double-precision floating-point comparison helper functions 533 // RTABI chapter 4.1.2, Table 3 534 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 535 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 536 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 537 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 538 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 539 { RTLIB::OGT_F64, 
"__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 540 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 541 542 // Single-precision floating-point arithmetic helper functions 543 // RTABI chapter 4.1.2, Table 4 544 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 545 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 546 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 547 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 548 549 // Single-precision floating-point comparison helper functions 550 // RTABI chapter 4.1.2, Table 5 551 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 552 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 553 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 554 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 555 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 556 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 557 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 558 559 // Floating-point to integer conversions. 
560 // RTABI chapter 4.1.2, Table 6 561 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 562 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 563 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 564 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 565 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 566 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 567 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 568 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 569 570 // Conversions between floating types. 571 // RTABI chapter 4.1.2, Table 7 572 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 573 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 574 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 575 576 // Integer to floating-point conversions. 
577 // RTABI chapter 4.1.2, Table 8 578 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 579 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 580 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 581 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 582 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 583 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 584 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 585 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 586 587 // Long long helper functions 588 // RTABI chapter 4.2, Table 9 589 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 590 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 591 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 592 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 593 594 // Integer division functions 595 // RTABI chapter 4.3.1 596 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 597 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 598 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 599 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 600 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 601 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 602 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 603 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 604 }; 605 606 for (const auto &LC : LibraryCalls) { 607 
setLibcallName(LC.Op, LC.Name); 608 setLibcallCallingConv(LC.Op, LC.CC); 609 if (LC.Cond != ISD::SETCC_INVALID) 610 setCmpLibcallCC(LC.Op, LC.Cond); 611 } 612 613 // EABI dependent RTLIB 614 if (TM.Options.EABIVersion == EABI::EABI4 || 615 TM.Options.EABIVersion == EABI::EABI5) { 616 static const struct { 617 const RTLIB::Libcall Op; 618 const char *const Name; 619 const CallingConv::ID CC; 620 const ISD::CondCode Cond; 621 } MemOpsLibraryCalls[] = { 622 // Memory operations 623 // RTABI chapter 4.3.4 624 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 627 }; 628 629 for (const auto &LC : MemOpsLibraryCalls) { 630 setLibcallName(LC.Op, LC.Name); 631 setLibcallCallingConv(LC.Op, LC.CC); 632 if (LC.Cond != ISD::SETCC_INVALID) 633 setCmpLibcallCC(LC.Op, LC.Cond); 634 } 635 } 636 } 637 638 if (Subtarget->isTargetWindows()) { 639 static const struct { 640 const RTLIB::Libcall Op; 641 const char * const Name; 642 const CallingConv::ID CC; 643 } LibraryCalls[] = { 644 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 645 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 646 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 647 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 648 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 649 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 650 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 651 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 652 }; 653 654 for (const auto &LC : LibraryCalls) { 655 setLibcallName(LC.Op, LC.Name); 656 setLibcallCallingConv(LC.Op, LC.CC); 657 } 658 } 659 660 // Use divmod compiler-rt calls for iOS 5.0 and later. 
661 if (Subtarget->isTargetMachO() && 662 !(Subtarget->isTargetIOS() && 663 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 664 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 665 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 666 } 667 668 // The half <-> float conversion functions are always soft-float on 669 // non-watchos platforms, but are needed for some targets which use a 670 // hard-float calling convention by default. 671 if (!Subtarget->isTargetWatchABI()) { 672 if (Subtarget->isAAPCS_ABI()) { 673 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 674 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 675 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 676 } else { 677 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 678 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 679 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 680 } 681 } 682 683 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 684 // a __gnu_ prefix (which is the default). 
685 if (Subtarget->isTargetAEABI()) { 686 static const struct { 687 const RTLIB::Libcall Op; 688 const char * const Name; 689 const CallingConv::ID CC; 690 } LibraryCalls[] = { 691 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 692 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 693 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 694 }; 695 696 for (const auto &LC : LibraryCalls) { 697 setLibcallName(LC.Op, LC.Name); 698 setLibcallCallingConv(LC.Op, LC.CC); 699 } 700 } 701 702 if (Subtarget->isThumb1Only()) 703 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 704 else 705 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 706 707 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 708 Subtarget->hasFPRegs()) { 709 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 710 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 711 if (!Subtarget->hasVFP2Base()) 712 setAllExpand(MVT::f32); 713 if (!Subtarget->hasFP64()) 714 setAllExpand(MVT::f64); 715 } 716 717 if (Subtarget->hasFullFP16()) { 718 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 719 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 720 setOperationAction(ISD::BITCAST, MVT::i32, Custom); 721 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 722 723 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 724 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 725 } 726 727 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 728 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 729 setTruncStoreAction(VT, InnerVT, Expand); 730 addAllExtLoads(VT, InnerVT, Expand); 731 } 732 733 setOperationAction(ISD::MULHS, VT, Expand); 734 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 735 setOperationAction(ISD::MULHU, VT, Expand); 736 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 737 738 setOperationAction(ISD::BSWAP, VT, Expand); 739 } 740 741 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 742 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 743 744 
setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 745 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 746 747 if (Subtarget->hasMVEIntegerOps()) 748 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 749 750 // Combine low-overhead loop intrinsics so that we can lower i1 types. 751 if (Subtarget->hasLOB()) { 752 setTargetDAGCombine(ISD::BRCOND); 753 setTargetDAGCombine(ISD::BR_CC); 754 } 755 756 if (Subtarget->hasNEON()) { 757 addDRTypeForNEON(MVT::v2f32); 758 addDRTypeForNEON(MVT::v8i8); 759 addDRTypeForNEON(MVT::v4i16); 760 addDRTypeForNEON(MVT::v2i32); 761 addDRTypeForNEON(MVT::v1i64); 762 763 addQRTypeForNEON(MVT::v4f32); 764 addQRTypeForNEON(MVT::v2f64); 765 addQRTypeForNEON(MVT::v16i8); 766 addQRTypeForNEON(MVT::v8i16); 767 addQRTypeForNEON(MVT::v4i32); 768 addQRTypeForNEON(MVT::v2i64); 769 770 if (Subtarget->hasFullFP16()) { 771 addQRTypeForNEON(MVT::v8f16); 772 addDRTypeForNEON(MVT::v4f16); 773 } 774 } 775 776 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 777 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 778 // none of Neon, MVE or VFP supports any arithmetic operations on it. 779 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 780 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 781 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 782 // FIXME: Code duplication: FDIV and FREM are expanded always, see 783 // ARMTargetLowering::addTypeForNEON method for details. 784 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 785 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 786 // FIXME: Create unittest. 787 // In another words, find a way when "copysign" appears in DAG with vector 788 // operands. 789 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 790 // FIXME: Code duplication: SETCC has custom operation action, see 791 // ARMTargetLowering::addTypeForNEON method for details. 792 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 793 // FIXME: Create unittest for FNEG and for FABS. 
794 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 795 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 796 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 797 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 798 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 799 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 800 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 801 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 802 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 803 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 804 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 805 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 806 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 807 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 808 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 809 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 810 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 811 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 812 } 813 814 if (Subtarget->hasNEON()) { 815 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 816 // supported for v4f32. 817 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 818 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 819 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 820 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 821 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 822 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 823 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 824 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 825 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 826 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 827 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 828 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 829 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 830 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 831 832 // Mark v2f32 intrinsics. 
833 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 834 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 835 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 836 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 837 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 838 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 839 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 840 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 841 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 842 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 843 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 844 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 845 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 846 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 847 848 // Neon does not support some operations on v1i64 and v2i64 types. 849 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 850 // Custom handling for some quad-vector types to detect VMULL. 851 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 852 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 853 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 854 // Custom handling for some vector types to avoid expensive expansions 855 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 856 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 857 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 858 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 859 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 860 // a destination type that is wider than the source, and nor does 861 // it have a FP_TO_[SU]INT instruction with a narrower destination than 862 // source. 
863 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 864 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 865 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 866 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 867 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 868 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 869 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 870 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 871 872 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 873 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 874 875 // NEON does not have single instruction CTPOP for vectors with element 876 // types wider than 8-bits. However, custom lowering can leverage the 877 // v8i8/v16i8 vcnt instruction. 878 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 879 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 880 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 881 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 882 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 883 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 884 885 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 886 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 887 888 // NEON does not have single instruction CTTZ for vectors. 
889 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 890 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 891 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 892 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 893 894 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 895 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 896 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 897 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 898 899 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 900 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 901 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 902 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 903 904 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 905 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 906 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 907 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 908 909 // NEON only has FMA instructions as of VFP4. 910 if (!Subtarget->hasVFP4Base()) { 911 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 912 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 913 } 914 915 setTargetDAGCombine(ISD::INTRINSIC_VOID); 916 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 917 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 918 setTargetDAGCombine(ISD::SHL); 919 setTargetDAGCombine(ISD::SRL); 920 setTargetDAGCombine(ISD::SRA); 921 setTargetDAGCombine(ISD::FP_TO_SINT); 922 setTargetDAGCombine(ISD::FP_TO_UINT); 923 setTargetDAGCombine(ISD::FDIV); 924 setTargetDAGCombine(ISD::LOAD); 925 926 // It is legal to extload from v4i8 to v4i16 or v4i32. 
927 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 928 MVT::v2i32}) { 929 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 930 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 931 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 932 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 933 } 934 } 935 } 936 937 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 938 setTargetDAGCombine(ISD::BUILD_VECTOR); 939 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 940 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 941 setTargetDAGCombine(ISD::STORE); 942 setTargetDAGCombine(ISD::SIGN_EXTEND); 943 setTargetDAGCombine(ISD::ZERO_EXTEND); 944 setTargetDAGCombine(ISD::ANY_EXTEND); 945 } 946 947 if (!Subtarget->hasFP64()) { 948 // When targeting a floating-point unit with only single-precision 949 // operations, f64 is legal for the few double-precision instructions which 950 // are present However, no double-precision operations other than moves, 951 // loads and stores are provided by the hardware. 
952 setOperationAction(ISD::FADD, MVT::f64, Expand); 953 setOperationAction(ISD::FSUB, MVT::f64, Expand); 954 setOperationAction(ISD::FMUL, MVT::f64, Expand); 955 setOperationAction(ISD::FMA, MVT::f64, Expand); 956 setOperationAction(ISD::FDIV, MVT::f64, Expand); 957 setOperationAction(ISD::FREM, MVT::f64, Expand); 958 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 959 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 960 setOperationAction(ISD::FNEG, MVT::f64, Expand); 961 setOperationAction(ISD::FABS, MVT::f64, Expand); 962 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 963 setOperationAction(ISD::FSIN, MVT::f64, Expand); 964 setOperationAction(ISD::FCOS, MVT::f64, Expand); 965 setOperationAction(ISD::FPOW, MVT::f64, Expand); 966 setOperationAction(ISD::FLOG, MVT::f64, Expand); 967 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 968 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 969 setOperationAction(ISD::FEXP, MVT::f64, Expand); 970 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 971 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 972 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 973 setOperationAction(ISD::FRINT, MVT::f64, Expand); 974 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 975 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 976 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 977 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 978 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 979 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 980 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 981 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 982 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 983 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 984 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 985 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); 986 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 987 
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 988 } 989 990 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 991 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 992 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 993 if (Subtarget->hasFullFP16()) { 994 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 995 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 996 } 997 } 998 999 if (!Subtarget->hasFP16()) { 1000 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1001 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1002 } 1003 1004 computeRegisterProperties(Subtarget->getRegisterInfo()); 1005 1006 // ARM does not have floating-point extending loads. 1007 for (MVT VT : MVT::fp_valuetypes()) { 1008 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1009 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1010 } 1011 1012 // ... or truncating stores 1013 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1014 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1015 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1016 1017 // ARM does not have i1 sign extending load. 1018 for (MVT VT : MVT::integer_valuetypes()) 1019 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1020 1021 // ARM supports all 4 flavors of integer indexed load / store. 1022 if (!Subtarget->isThumb1Only()) { 1023 for (unsigned im = (unsigned)ISD::PRE_INC; 1024 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1025 setIndexedLoadAction(im, MVT::i1, Legal); 1026 setIndexedLoadAction(im, MVT::i8, Legal); 1027 setIndexedLoadAction(im, MVT::i16, Legal); 1028 setIndexedLoadAction(im, MVT::i32, Legal); 1029 setIndexedStoreAction(im, MVT::i1, Legal); 1030 setIndexedStoreAction(im, MVT::i8, Legal); 1031 setIndexedStoreAction(im, MVT::i16, Legal); 1032 setIndexedStoreAction(im, MVT::i32, Legal); 1033 } 1034 } else { 1035 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
1036 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1037 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1038 } 1039 1040 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1041 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1042 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1043 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1044 1045 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1046 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1047 if (Subtarget->hasDSP()) { 1048 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1049 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1050 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1051 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1052 } 1053 if (Subtarget->hasBaseDSP()) { 1054 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1055 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1056 } 1057 1058 // i64 operation support. 1059 setOperationAction(ISD::MUL, MVT::i64, Expand); 1060 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1061 if (Subtarget->isThumb1Only()) { 1062 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1063 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1064 } 1065 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1066 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1067 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1068 1069 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1070 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1071 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1072 setOperationAction(ISD::SRL, MVT::i64, Custom); 1073 setOperationAction(ISD::SRA, MVT::i64, Custom); 1074 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1075 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1076 1077 // MVE lowers 64 bit shifts to lsll and lsrl 1078 // assuming that ISD::SRL and SRA of i64 are already marked custom 1079 if (Subtarget->hasMVEIntegerOps()) 1080 setOperationAction(ISD::SHL, 
MVT::i64, Custom); 1081 1082 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1083 if (Subtarget->isThumb1Only()) { 1084 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1085 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1086 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1087 } 1088 1089 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1090 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1091 1092 // ARM does not have ROTL. 1093 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1094 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1095 setOperationAction(ISD::ROTL, VT, Expand); 1096 setOperationAction(ISD::ROTR, VT, Expand); 1097 } 1098 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1099 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1100 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1101 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1102 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1103 } 1104 1105 // @llvm.readcyclecounter requires the Performance Monitors extension. 1106 // Default to the 0 expansion on unsupported platforms. 1107 // FIXME: Technically there are older ARM CPUs that have 1108 // implementation-specific ways of obtaining this information. 1109 if (Subtarget->hasPerfMon()) 1110 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1111 1112 // Only ARMv6 has BSWAP. 1113 if (!Subtarget->hasV6Ops()) 1114 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1115 1116 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1117 : Subtarget->hasDivideInARMMode(); 1118 if (!hasDivide) { 1119 // These are expanded into libcalls if the cpu doesn't have HW divider. 
1120 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1121 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1122 } 1123 1124 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1125 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1126 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1127 1128 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1129 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1130 } 1131 1132 setOperationAction(ISD::SREM, MVT::i32, Expand); 1133 setOperationAction(ISD::UREM, MVT::i32, Expand); 1134 1135 // Register based DivRem for AEABI (RTABI 4.2) 1136 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1137 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1138 Subtarget->isTargetWindows()) { 1139 setOperationAction(ISD::SREM, MVT::i64, Custom); 1140 setOperationAction(ISD::UREM, MVT::i64, Custom); 1141 HasStandaloneRem = false; 1142 1143 if (Subtarget->isTargetWindows()) { 1144 const struct { 1145 const RTLIB::Libcall Op; 1146 const char * const Name; 1147 const CallingConv::ID CC; 1148 } LibraryCalls[] = { 1149 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1150 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1151 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1152 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1153 1154 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1155 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1156 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1157 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1158 }; 1159 1160 for (const auto &LC : LibraryCalls) { 1161 setLibcallName(LC.Op, LC.Name); 1162 setLibcallCallingConv(LC.Op, LC.CC); 1163 } 1164 } else { 1165 const struct { 1166 const RTLIB::Libcall Op; 1167 const char * const Name; 1168 const CallingConv::ID CC; 1169 } LibraryCalls[] = { 1170 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 
1171 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1172 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1173 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1174 1175 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1176 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1177 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1178 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1179 }; 1180 1181 for (const auto &LC : LibraryCalls) { 1182 setLibcallName(LC.Op, LC.Name); 1183 setLibcallCallingConv(LC.Op, LC.CC); 1184 } 1185 } 1186 1187 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1188 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1189 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1190 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1191 } else { 1192 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1193 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1194 } 1195 1196 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1197 // MSVCRT doesn't have powi; fall back to pow 1198 setLibcallName(RTLIB::POWI_F32, nullptr); 1199 setLibcallName(RTLIB::POWI_F64, nullptr); 1200 } 1201 1202 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1203 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1204 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1205 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1206 1207 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1208 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1209 1210 // Use the default implementation. 
1211 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1212 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1213 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1214 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1215 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1216 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1217 1218 if (Subtarget->isTargetWindows()) 1219 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1220 else 1221 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1222 1223 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1224 // the default expansion. 1225 InsertFencesForAtomic = false; 1226 if (Subtarget->hasAnyDataBarrier() && 1227 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1228 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1229 // to ldrex/strex loops already. 1230 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1231 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1232 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1233 1234 // On v8, we have particularly efficient implementations of atomic fences 1235 // if they can be combined with nearby atomic loads and stores. 1236 if (!Subtarget->hasAcquireRelease() || 1237 getTargetMachine().getOptLevel() == 0) { 1238 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1239 InsertFencesForAtomic = true; 1240 } 1241 } else { 1242 // If there's anything we can use as a barrier, go through custom lowering 1243 // for ATOMIC_FENCE. 1244 // If target has DMB in thumb, Fences can be inserted. 1245 if (Subtarget->hasDataBarrier()) 1246 InsertFencesForAtomic = true; 1247 1248 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1249 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1250 1251 // Set them all for expansion, which will force libcalls. 
1252 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1253 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1254 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1255 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1256 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1257 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1258 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1259 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1260 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1261 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1262 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1263 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1264 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1265 // Unordered/Monotonic case. 1266 if (!InsertFencesForAtomic) { 1267 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1268 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1269 } 1270 } 1271 1272 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1273 1274 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1275 if (!Subtarget->hasV6Ops()) { 1276 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1277 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1278 } 1279 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1280 1281 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1282 !Subtarget->isThumb1Only()) { 1283 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1284 // iff target supports vfp2. 1285 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1286 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1287 } 1288 1289 // We want to custom lower some of our intrinsics. 
1290 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1291 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1292 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1293 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1294 if (Subtarget->useSjLjEH()) 1295 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1296 1297 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1298 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1299 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1300 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1301 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1302 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1303 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1304 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1305 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1306 if (Subtarget->hasFullFP16()) { 1307 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1308 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1309 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1310 } 1311 1312 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1313 1314 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1315 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1316 if (Subtarget->hasFullFP16()) 1317 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1318 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1319 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1320 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1321 1322 // We don't support sin/cos/fmod/copysign/pow 1323 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1324 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1325 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1326 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1327 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1328 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1329 setOperationAction(ISD::FREM, MVT::f64, Expand); 1330 
setOperationAction(ISD::FREM, MVT::f32, Expand); 1331 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1332 !Subtarget->isThumb1Only()) { 1333 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1334 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1335 } 1336 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1337 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1338 1339 if (!Subtarget->hasVFP4Base()) { 1340 setOperationAction(ISD::FMA, MVT::f64, Expand); 1341 setOperationAction(ISD::FMA, MVT::f32, Expand); 1342 } 1343 1344 // Various VFP goodness 1345 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1346 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1347 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1348 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1349 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1350 } 1351 1352 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1353 if (!Subtarget->hasFP16()) { 1354 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1355 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1356 } 1357 1358 // Strict floating-point comparisons need custom lowering. 1359 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1360 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1361 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1362 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1363 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1364 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1365 } 1366 1367 // Use __sincos_stret if available. 1368 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1369 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1370 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1371 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1372 } 1373 1374 // FP-ARMv8 implements a lot of rounding-like FP operations. 
1375 if (Subtarget->hasFPARMv8Base()) { 1376 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1377 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1378 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1379 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1380 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1381 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1382 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1383 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1384 if (Subtarget->hasNEON()) { 1385 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1386 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1387 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1388 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1389 } 1390 1391 if (Subtarget->hasFP64()) { 1392 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1393 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1394 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1395 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1396 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1397 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1398 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1399 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1400 } 1401 } 1402 1403 // FP16 often need to be promoted to call lib functions 1404 if (Subtarget->hasFullFP16()) { 1405 setOperationAction(ISD::FREM, MVT::f16, Promote); 1406 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1407 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1408 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1409 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1410 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1411 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1412 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1413 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1414 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1415 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1416 
setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1417 1418 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1419 } 1420 1421 if (Subtarget->hasNEON()) { 1422 // vmin and vmax aren't available in a scalar form, so we use 1423 // a NEON instruction with an undef lane instead. 1424 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1425 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1426 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1427 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1428 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1429 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1430 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1431 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1432 1433 if (Subtarget->hasFullFP16()) { 1434 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1435 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1436 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1437 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1438 1439 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1440 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1441 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1442 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1443 } 1444 } 1445 1446 // We have target-specific dag combine patterns for the following nodes: 1447 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1448 setTargetDAGCombine(ISD::ADD); 1449 setTargetDAGCombine(ISD::SUB); 1450 setTargetDAGCombine(ISD::MUL); 1451 setTargetDAGCombine(ISD::AND); 1452 setTargetDAGCombine(ISD::OR); 1453 setTargetDAGCombine(ISD::XOR); 1454 1455 if (Subtarget->hasV6Ops()) 1456 setTargetDAGCombine(ISD::SRL); 1457 if (Subtarget->isThumb1Only()) 1458 setTargetDAGCombine(ISD::SHL); 1459 1460 setStackPointerRegisterToSaveRestore(ARM::SP); 1461 1462 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1463 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1464 
setSchedulingPreference(Sched::RegPressure); 1465 else 1466 setSchedulingPreference(Sched::Hybrid); 1467 1468 //// temporary - rewrite interface to use type 1469 MaxStoresPerMemset = 8; 1470 MaxStoresPerMemsetOptSize = 4; 1471 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1472 MaxStoresPerMemcpyOptSize = 2; 1473 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1474 MaxStoresPerMemmoveOptSize = 2; 1475 1476 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1477 // are at least 4 bytes aligned. 1478 setMinStackArgumentAlignment(Align(4)); 1479 1480 // Prefer likely predicted branches to selects on out-of-order cores. 1481 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1482 1483 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1484 1485 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1486 1487 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1488 setTargetDAGCombine(ISD::ABS); 1489 } 1490 1491 bool ARMTargetLowering::useSoftFloat() const { 1492 return Subtarget->useSoftFloat(); 1493 } 1494 1495 // FIXME: It might make sense to define the representative register class as the 1496 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1497 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1498 // SPR's representative would be DPR_VFP2. This should work well if register 1499 // pressure tracking were modified such that a register use would increment the 1500 // pressure of the register class's representative and all of it's super 1501 // classes' representatives transitively. We have not implemented this because 1502 // of the difficulty prior to coalescing of modeling operand register classes 1503 // due to the common occurrence of cross class copies and subregister insertions 1504 // and extractions. 
1505 std::pair<const TargetRegisterClass *, uint8_t> 1506 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1507 MVT VT) const { 1508 const TargetRegisterClass *RRC = nullptr; 1509 uint8_t Cost = 1; 1510 switch (VT.SimpleTy) { 1511 default: 1512 return TargetLowering::findRepresentativeClass(TRI, VT); 1513 // Use DPR as representative register class for all floating point 1514 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1515 // the cost is 1 for both f32 and f64. 1516 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1517 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1518 RRC = &ARM::DPRRegClass; 1519 // When NEON is used for SP, only half of the register file is available 1520 // because operations that define both SP and DP results will be constrained 1521 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1522 // coalescing by double-counting the SP regs. See the FIXME above. 1523 if (Subtarget->useNEONForSinglePrecisionFP()) 1524 Cost = 2; 1525 break; 1526 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1527 case MVT::v4f32: case MVT::v2f64: 1528 RRC = &ARM::DPRRegClass; 1529 Cost = 2; 1530 break; 1531 case MVT::v4i64: 1532 RRC = &ARM::DPRRegClass; 1533 Cost = 4; 1534 break; 1535 case MVT::v8i64: 1536 RRC = &ARM::DPRRegClass; 1537 Cost = 8; 1538 break; 1539 } 1540 return std::make_pair(RRC, Cost); 1541 } 1542 1543 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1544 switch ((ARMISD::NodeType)Opcode) { 1545 case ARMISD::FIRST_NUMBER: break; 1546 case ARMISD::Wrapper: return "ARMISD::Wrapper"; 1547 case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; 1548 case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; 1549 case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; 1550 case ARMISD::CALL: return "ARMISD::CALL"; 1551 case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; 1552 case ARMISD::CALL_NOLINK: return 
"ARMISD::CALL_NOLINK"; 1553 case ARMISD::BRCOND: return "ARMISD::BRCOND"; 1554 case ARMISD::BR_JT: return "ARMISD::BR_JT"; 1555 case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; 1556 case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; 1557 case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; 1558 case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; 1559 case ARMISD::CMP: return "ARMISD::CMP"; 1560 case ARMISD::CMN: return "ARMISD::CMN"; 1561 case ARMISD::CMPZ: return "ARMISD::CMPZ"; 1562 case ARMISD::CMPFP: return "ARMISD::CMPFP"; 1563 case ARMISD::CMPFPE: return "ARMISD::CMPFPE"; 1564 case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; 1565 case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0"; 1566 case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; 1567 case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; 1568 1569 case ARMISD::CMOV: return "ARMISD::CMOV"; 1570 case ARMISD::SUBS: return "ARMISD::SUBS"; 1571 1572 case ARMISD::SSAT: return "ARMISD::SSAT"; 1573 case ARMISD::USAT: return "ARMISD::USAT"; 1574 1575 case ARMISD::ASRL: return "ARMISD::ASRL"; 1576 case ARMISD::LSRL: return "ARMISD::LSRL"; 1577 case ARMISD::LSLL: return "ARMISD::LSLL"; 1578 1579 case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; 1580 case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; 1581 case ARMISD::RRX: return "ARMISD::RRX"; 1582 1583 case ARMISD::ADDC: return "ARMISD::ADDC"; 1584 case ARMISD::ADDE: return "ARMISD::ADDE"; 1585 case ARMISD::SUBC: return "ARMISD::SUBC"; 1586 case ARMISD::SUBE: return "ARMISD::SUBE"; 1587 case ARMISD::LSLS: return "ARMISD::LSLS"; 1588 1589 case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; 1590 case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; 1591 case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; 1592 case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; 1593 case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; 1594 1595 case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; 1596 case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; 1597 case ARMISD::EH_SJLJ_SETUP_DISPATCH: return 
"ARMISD::EH_SJLJ_SETUP_DISPATCH"; 1598 1599 case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; 1600 1601 case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; 1602 1603 case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; 1604 1605 case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; 1606 1607 case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; 1608 1609 case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; 1610 case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; 1611 1612 case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; 1613 case ARMISD::VCMP: return "ARMISD::VCMP"; 1614 case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; 1615 case ARMISD::VTST: return "ARMISD::VTST"; 1616 1617 case ARMISD::VSHLs: return "ARMISD::VSHLs"; 1618 case ARMISD::VSHLu: return "ARMISD::VSHLu"; 1619 case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; 1620 case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; 1621 case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; 1622 case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; 1623 case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; 1624 case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; 1625 case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; 1626 case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; 1627 case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; 1628 case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; 1629 case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; 1630 case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; 1631 case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; 1632 case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; 1633 case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; 1634 case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; 1635 case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; 1636 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1637 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1638 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1639 case ARMISD::VMVNIMM: return 
"ARMISD::VMVNIMM"; 1640 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1641 case ARMISD::VDUP: return "ARMISD::VDUP"; 1642 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1643 case ARMISD::VEXT: return "ARMISD::VEXT"; 1644 case ARMISD::VREV64: return "ARMISD::VREV64"; 1645 case ARMISD::VREV32: return "ARMISD::VREV32"; 1646 case ARMISD::VREV16: return "ARMISD::VREV16"; 1647 case ARMISD::VZIP: return "ARMISD::VZIP"; 1648 case ARMISD::VUZP: return "ARMISD::VUZP"; 1649 case ARMISD::VTRN: return "ARMISD::VTRN"; 1650 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1651 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1652 case ARMISD::VMOVN: return "ARMISD::VMOVN"; 1653 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1654 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1655 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1656 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1657 case ARMISD::SMLAL: return "ARMISD::SMLAL"; 1658 case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; 1659 case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; 1660 case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; 1661 case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; 1662 case ARMISD::SMULWB: return "ARMISD::SMULWB"; 1663 case ARMISD::SMULWT: return "ARMISD::SMULWT"; 1664 case ARMISD::SMLALD: return "ARMISD::SMLALD"; 1665 case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; 1666 case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; 1667 case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; 1668 case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; 1669 case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; 1670 case ARMISD::QADD16b: return "ARMISD::QADD16b"; 1671 case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; 1672 case ARMISD::QADD8b: return "ARMISD::QADD8b"; 1673 case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; 1674 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1675 case ARMISD::BFI: return "ARMISD::BFI"; 1676 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1677 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1678 case ARMISD::VBSL: return 
"ARMISD::VBSL"; 1679 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1680 case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; 1681 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1682 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1683 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1684 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1685 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1686 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1687 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1688 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1689 case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 1690 case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; 1691 case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; 1692 case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; 1693 case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; 1694 case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; 1695 case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; 1696 case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; 1697 case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; 1698 case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; 1699 case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; 1700 case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; 1701 case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; 1702 case ARMISD::WLS: return "ARMISD::WLS"; 1703 case ARMISD::LE: return "ARMISD::LE"; 1704 case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; 1705 case ARMISD::CSINV: return "ARMISD::CSINV"; 1706 case ARMISD::CSNEG: return "ARMISD::CSNEG"; 1707 case ARMISD::CSINC: return "ARMISD::CSINC"; 1708 } 1709 return nullptr; 1710 } 1711 1712 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1713 EVT VT) const { 1714 if (!VT.isVector()) 1715 return getPointerTy(DL); 1716 1717 // MVE has a predicate register. 
1718 if (Subtarget->hasMVEIntegerOps() && 1719 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) 1720 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1721 return VT.changeVectorElementTypeToInteger(); 1722 } 1723 1724 /// getRegClassFor - Return the register class that should be used for the 1725 /// specified value type. 1726 const TargetRegisterClass * 1727 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1728 (void)isDivergent; 1729 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1730 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1731 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1732 // MVE Q registers. 1733 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1734 if (VT == MVT::v4i64) 1735 return &ARM::QQPRRegClass; 1736 if (VT == MVT::v8i64) 1737 return &ARM::QQQQPRRegClass; 1738 } 1739 return TargetLowering::getRegClassFor(VT); 1740 } 1741 1742 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1743 // source/dest is aligned and the copy size is large enough. We therefore want 1744 // to align such objects passed to memory intrinsics. 1745 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1746 unsigned &PrefAlign) const { 1747 if (!isa<MemIntrinsic>(CI)) 1748 return false; 1749 MinSize = 8; 1750 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1751 // cycle faster than 4-byte aligned LDM. 1752 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1753 return true; 1754 } 1755 1756 // Create a fast isel object. 
1757 FastISel * 1758 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1759 const TargetLibraryInfo *libInfo) const { 1760 return ARM::createFastISel(funcInfo, libInfo); 1761 } 1762 1763 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1764 unsigned NumVals = N->getNumValues(); 1765 if (!NumVals) 1766 return Sched::RegPressure; 1767 1768 for (unsigned i = 0; i != NumVals; ++i) { 1769 EVT VT = N->getValueType(i); 1770 if (VT == MVT::Glue || VT == MVT::Other) 1771 continue; 1772 if (VT.isFloatingPoint() || VT.isVector()) 1773 return Sched::ILP; 1774 } 1775 1776 if (!N->isMachineOpcode()) 1777 return Sched::RegPressure; 1778 1779 // Load are scheduled for latency even if there instruction itinerary 1780 // is not available. 1781 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1782 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1783 1784 if (MCID.getNumDefs() == 0) 1785 return Sched::RegPressure; 1786 if (!Itins->isEmpty() && 1787 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1788 return Sched::ILP; 1789 1790 return Sched::RegPressure; 1791 } 1792 1793 //===----------------------------------------------------------------------===// 1794 // Lowering Code 1795 //===----------------------------------------------------------------------===// 1796 1797 static bool isSRL16(const SDValue &Op) { 1798 if (Op.getOpcode() != ISD::SRL) 1799 return false; 1800 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1801 return Const->getZExtValue() == 16; 1802 return false; 1803 } 1804 1805 static bool isSRA16(const SDValue &Op) { 1806 if (Op.getOpcode() != ISD::SRA) 1807 return false; 1808 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1809 return Const->getZExtValue() == 16; 1810 return false; 1811 } 1812 1813 static bool isSHL16(const SDValue &Op) { 1814 if (Op.getOpcode() != ISD::SHL) 1815 return false; 1816 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1817 return 
Const->getZExtValue() == 16; 1818 return false; 1819 } 1820 1821 // Check for a signed 16-bit value. We special case SRA because it makes it 1822 // more simple when also looking for SRAs that aren't sign extending a 1823 // smaller value. Without the check, we'd need to take extra care with 1824 // checking order for some operations. 1825 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1826 if (isSRA16(Op)) 1827 return isSHL16(Op.getOperand(0)); 1828 return DAG.ComputeNumSignBits(Op) == 17; 1829 } 1830 1831 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1832 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1833 switch (CC) { 1834 default: llvm_unreachable("Unknown condition code!"); 1835 case ISD::SETNE: return ARMCC::NE; 1836 case ISD::SETEQ: return ARMCC::EQ; 1837 case ISD::SETGT: return ARMCC::GT; 1838 case ISD::SETGE: return ARMCC::GE; 1839 case ISD::SETLT: return ARMCC::LT; 1840 case ISD::SETLE: return ARMCC::LE; 1841 case ISD::SETUGT: return ARMCC::HI; 1842 case ISD::SETUGE: return ARMCC::HS; 1843 case ISD::SETULT: return ARMCC::LO; 1844 case ISD::SETULE: return ARMCC::LS; 1845 } 1846 } 1847 1848 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
1849 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1850 ARMCC::CondCodes &CondCode2) { 1851 CondCode2 = ARMCC::AL; 1852 switch (CC) { 1853 default: llvm_unreachable("Unknown FP condition!"); 1854 case ISD::SETEQ: 1855 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1856 case ISD::SETGT: 1857 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1858 case ISD::SETGE: 1859 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1860 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1861 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1862 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1863 case ISD::SETO: CondCode = ARMCC::VC; break; 1864 case ISD::SETUO: CondCode = ARMCC::VS; break; 1865 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1866 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1867 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1868 case ISD::SETLT: 1869 case ISD::SETULT: CondCode = ARMCC::LT; break; 1870 case ISD::SETLE: 1871 case ISD::SETULE: CondCode = ARMCC::LE; break; 1872 case ISD::SETNE: 1873 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1874 } 1875 } 1876 1877 //===----------------------------------------------------------------------===// 1878 // Calling Convention Implementation 1879 //===----------------------------------------------------------------------===// 1880 1881 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1882 /// account presence of floating point hardware and calling convention 1883 /// limitations, such as support for variadic functions. 
1884 CallingConv::ID 1885 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1886 bool isVarArg) const { 1887 switch (CC) { 1888 default: 1889 report_fatal_error("Unsupported calling convention"); 1890 case CallingConv::ARM_AAPCS: 1891 case CallingConv::ARM_APCS: 1892 case CallingConv::GHC: 1893 case CallingConv::CFGuard_Check: 1894 return CC; 1895 case CallingConv::PreserveMost: 1896 return CallingConv::PreserveMost; 1897 case CallingConv::ARM_AAPCS_VFP: 1898 case CallingConv::Swift: 1899 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1900 case CallingConv::C: 1901 if (!Subtarget->isAAPCS_ABI()) 1902 return CallingConv::ARM_APCS; 1903 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 1904 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1905 !isVarArg) 1906 return CallingConv::ARM_AAPCS_VFP; 1907 else 1908 return CallingConv::ARM_AAPCS; 1909 case CallingConv::Fast: 1910 case CallingConv::CXX_FAST_TLS: 1911 if (!Subtarget->isAAPCS_ABI()) { 1912 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 1913 return CallingConv::Fast; 1914 return CallingConv::ARM_APCS; 1915 } else if (Subtarget->hasVFP2Base() && 1916 !Subtarget->isThumb1Only() && !isVarArg) 1917 return CallingConv::ARM_AAPCS_VFP; 1918 else 1919 return CallingConv::ARM_AAPCS; 1920 } 1921 } 1922 1923 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1924 bool isVarArg) const { 1925 return CCAssignFnForNode(CC, false, isVarArg); 1926 } 1927 1928 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1929 bool isVarArg) const { 1930 return CCAssignFnForNode(CC, true, isVarArg); 1931 } 1932 1933 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1934 /// CallingConvention. 
1935 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1936 bool Return, 1937 bool isVarArg) const { 1938 switch (getEffectiveCallingConv(CC, isVarArg)) { 1939 default: 1940 report_fatal_error("Unsupported calling convention"); 1941 case CallingConv::ARM_APCS: 1942 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1943 case CallingConv::ARM_AAPCS: 1944 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1945 case CallingConv::ARM_AAPCS_VFP: 1946 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1947 case CallingConv::Fast: 1948 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1949 case CallingConv::GHC: 1950 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1951 case CallingConv::PreserveMost: 1952 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1953 case CallingConv::CFGuard_Check: 1954 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 1955 } 1956 } 1957 1958 /// LowerCallResult - Lower the result values of a call into the 1959 /// appropriate copies out of appropriate physical registers. 1960 SDValue ARMTargetLowering::LowerCallResult( 1961 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1962 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1963 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1964 SDValue ThisVal) const { 1965 // Assign locations to each value returned by this call. 1966 SmallVector<CCValAssign, 16> RVLocs; 1967 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1968 *DAG.getContext()); 1969 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1970 1971 // Copy all of the result registers out of their specified physreg. 
1972 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1973 CCValAssign VA = RVLocs[i]; 1974 1975 // Pass 'this' value directly from the argument to return value, to avoid 1976 // reg unit interference 1977 if (i == 0 && isThisReturn) { 1978 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1979 "unexpected return calling convention register assignment"); 1980 InVals.push_back(ThisVal); 1981 continue; 1982 } 1983 1984 SDValue Val; 1985 if (VA.needsCustom()) { 1986 // Handle f64 or half of a v2f64. 1987 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1988 InFlag); 1989 Chain = Lo.getValue(1); 1990 InFlag = Lo.getValue(2); 1991 VA = RVLocs[++i]; // skip ahead to next loc 1992 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1993 InFlag); 1994 Chain = Hi.getValue(1); 1995 InFlag = Hi.getValue(2); 1996 if (!Subtarget->isLittle()) 1997 std::swap (Lo, Hi); 1998 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1999 2000 if (VA.getLocVT() == MVT::v2f64) { 2001 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 2002 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2003 DAG.getConstant(0, dl, MVT::i32)); 2004 2005 VA = RVLocs[++i]; // skip ahead to next loc 2006 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2007 Chain = Lo.getValue(1); 2008 InFlag = Lo.getValue(2); 2009 VA = RVLocs[++i]; // skip ahead to next loc 2010 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2011 Chain = Hi.getValue(1); 2012 InFlag = Hi.getValue(2); 2013 if (!Subtarget->isLittle()) 2014 std::swap (Lo, Hi); 2015 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2016 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2017 DAG.getConstant(1, dl, MVT::i32)); 2018 } 2019 } else { 2020 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2021 InFlag); 2022 Chain = Val.getValue(1); 2023 InFlag = Val.getValue(2); 2024 } 2025 2026 switch 
(VA.getLocInfo()) { 2027 default: llvm_unreachable("Unknown loc info!"); 2028 case CCValAssign::Full: break; 2029 case CCValAssign::BCvt: 2030 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2031 break; 2032 } 2033 2034 InVals.push_back(Val); 2035 } 2036 2037 return Chain; 2038 } 2039 2040 /// LowerMemOpCallTo - Store the argument to the stack. 2041 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 2042 SDValue Arg, const SDLoc &dl, 2043 SelectionDAG &DAG, 2044 const CCValAssign &VA, 2045 ISD::ArgFlagsTy Flags) const { 2046 unsigned LocMemOffset = VA.getLocMemOffset(); 2047 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2048 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2049 StackPtr, PtrOff); 2050 return DAG.getStore( 2051 Chain, dl, Arg, PtrOff, 2052 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 2053 } 2054 2055 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2056 SDValue Chain, SDValue &Arg, 2057 RegsToPassVector &RegsToPass, 2058 CCValAssign &VA, CCValAssign &NextVA, 2059 SDValue &StackPtr, 2060 SmallVectorImpl<SDValue> &MemOpChains, 2061 ISD::ArgFlagsTy Flags) const { 2062 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2063 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2064 unsigned id = Subtarget->isLittle() ? 0 : 1; 2065 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2066 2067 if (NextVA.isRegLoc()) 2068 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2069 else { 2070 assert(NextVA.isMemLoc()); 2071 if (!StackPtr.getNode()) 2072 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2073 getPointerTy(DAG.getDataLayout())); 2074 2075 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 2076 dl, DAG, NextVA, 2077 Flags)); 2078 } 2079 } 2080 2081 /// LowerCall - Lowering a call into a callseq_start <- 2082 /// ARMISD:CALL <- callseq_end chain. 
Also add input and output parameter 2083 /// nodes. 2084 SDValue 2085 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2086 SmallVectorImpl<SDValue> &InVals) const { 2087 SelectionDAG &DAG = CLI.DAG; 2088 SDLoc &dl = CLI.DL; 2089 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2090 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2091 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2092 SDValue Chain = CLI.Chain; 2093 SDValue Callee = CLI.Callee; 2094 bool &isTailCall = CLI.IsTailCall; 2095 CallingConv::ID CallConv = CLI.CallConv; 2096 bool doesNotRet = CLI.DoesNotReturn; 2097 bool isVarArg = CLI.IsVarArg; 2098 2099 MachineFunction &MF = DAG.getMachineFunction(); 2100 MachineFunction::CallSiteInfo CSInfo; 2101 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2102 bool isThisReturn = false; 2103 bool PreferIndirect = false; 2104 2105 // Disable tail calls if they're not supported. 2106 if (!Subtarget->supportsTailCall()) 2107 isTailCall = false; 2108 2109 if (isa<GlobalAddressSDNode>(Callee)) { 2110 // If we're optimizing for minimum size and the function is called three or 2111 // more times in this block, we can improve codesize by calling indirectly 2112 // as BLXr has a 16-bit encoding. 2113 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2114 if (CLI.CS) { 2115 auto *BB = CLI.CS.getParent(); 2116 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2117 count_if(GV->users(), [&BB](const User *U) { 2118 return isa<Instruction>(U) && 2119 cast<Instruction>(U)->getParent() == BB; 2120 }) > 2; 2121 } 2122 } 2123 if (isTailCall) { 2124 // Check if it's really possible to do a tail call. 
2125 isTailCall = IsEligibleForTailCallOptimization( 2126 Callee, CallConv, isVarArg, isStructRet, 2127 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2128 PreferIndirect); 2129 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 2130 report_fatal_error("failed to perform tail call elimination on a call " 2131 "site marked musttail"); 2132 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2133 // detected sibcalls. 2134 if (isTailCall) 2135 ++NumTailCalls; 2136 } 2137 2138 // Analyze operands of the call, assigning locations to each operand. 2139 SmallVector<CCValAssign, 16> ArgLocs; 2140 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2141 *DAG.getContext()); 2142 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2143 2144 // Get a count of how many bytes are to be pushed on the stack. 2145 unsigned NumBytes = CCInfo.getNextStackOffset(); 2146 2147 if (isTailCall) { 2148 // For tail calls, memory operands are available in our caller's stack. 2149 NumBytes = 0; 2150 } else { 2151 // Adjust the stack pointer for the new arguments... 2152 // These operations are automatically eliminated by the prolog/epilog pass 2153 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2154 } 2155 2156 SDValue StackPtr = 2157 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2158 2159 RegsToPassVector RegsToPass; 2160 SmallVector<SDValue, 8> MemOpChains; 2161 2162 // Walk the register/memloc assignments, inserting copies/loads. In the case 2163 // of tail call optimization, arguments are handled later. 2164 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2165 i != e; 2166 ++i, ++realArgIdx) { 2167 CCValAssign &VA = ArgLocs[i]; 2168 SDValue Arg = OutVals[realArgIdx]; 2169 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2170 bool isByVal = Flags.isByVal(); 2171 2172 // Promote the value if needed. 
2173 switch (VA.getLocInfo()) { 2174 default: llvm_unreachable("Unknown loc info!"); 2175 case CCValAssign::Full: break; 2176 case CCValAssign::SExt: 2177 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2178 break; 2179 case CCValAssign::ZExt: 2180 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2181 break; 2182 case CCValAssign::AExt: 2183 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2184 break; 2185 case CCValAssign::BCvt: 2186 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2187 break; 2188 } 2189 2190 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2191 if (VA.needsCustom()) { 2192 if (VA.getLocVT() == MVT::v2f64) { 2193 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2194 DAG.getConstant(0, dl, MVT::i32)); 2195 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2196 DAG.getConstant(1, dl, MVT::i32)); 2197 2198 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 2199 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2200 2201 VA = ArgLocs[++i]; // skip ahead to next loc 2202 if (VA.isRegLoc()) { 2203 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 2204 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2205 } else { 2206 assert(VA.isMemLoc()); 2207 2208 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 2209 dl, DAG, VA, Flags)); 2210 } 2211 } else { 2212 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2213 StackPtr, MemOpChains, Flags); 2214 } 2215 } else if (VA.isRegLoc()) { 2216 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2217 Outs[0].VT == MVT::i32) { 2218 assert(VA.getLocVT() == MVT::i32 && 2219 "unexpected calling convention register assignment"); 2220 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2221 "unexpected use of 'returned'"); 2222 isThisReturn = true; 2223 } 2224 const TargetOptions &Options = DAG.getTarget().Options; 2225 if (Options.EnableDebugEntryValues) 2226 
CSInfo.emplace_back(VA.getLocReg(), i); 2227 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2228 } else if (isByVal) { 2229 assert(VA.isMemLoc()); 2230 unsigned offset = 0; 2231 2232 // True if this byval aggregate will be split between registers 2233 // and memory. 2234 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2235 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2236 2237 if (CurByValIdx < ByValArgsCount) { 2238 2239 unsigned RegBegin, RegEnd; 2240 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2241 2242 EVT PtrVT = 2243 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2244 unsigned int i, j; 2245 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2246 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2247 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2248 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 2249 MachinePointerInfo(), 2250 DAG.InferPtrAlignment(AddArg)); 2251 MemOpChains.push_back(Load.getValue(1)); 2252 RegsToPass.push_back(std::make_pair(j, Load)); 2253 } 2254 2255 // If parameter size outsides register area, "offset" value 2256 // helps us to calculate stack slot for remained part properly. 
2257 offset = RegEnd - RegBegin; 2258 2259 CCInfo.nextInRegsParam(); 2260 } 2261 2262 if (Flags.getByValSize() > 4*offset) { 2263 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2264 unsigned LocMemOffset = VA.getLocMemOffset(); 2265 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2266 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2267 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2268 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2269 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2270 MVT::i32); 2271 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 2272 MVT::i32); 2273 2274 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2275 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2276 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2277 Ops)); 2278 } 2279 } else if (!isTailCall) { 2280 assert(VA.isMemLoc()); 2281 2282 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2283 dl, DAG, VA, Flags)); 2284 } 2285 } 2286 2287 if (!MemOpChains.empty()) 2288 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2289 2290 // Build a sequence of copy-to-reg nodes chained together with token chain 2291 // and flag operands which copy the outgoing args into the appropriate regs. 2292 SDValue InFlag; 2293 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2294 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2295 RegsToPass[i].second, InFlag); 2296 InFlag = Chain.getValue(1); 2297 } 2298 2299 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2300 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2301 // node so that legalize doesn't hack it. 
2302 bool isDirect = false; 2303 2304 const TargetMachine &TM = getTargetMachine(); 2305 const Module *Mod = MF.getFunction().getParent(); 2306 const GlobalValue *GV = nullptr; 2307 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2308 GV = G->getGlobal(); 2309 bool isStub = 2310 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2311 2312 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2313 bool isLocalARMFunc = false; 2314 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2315 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2316 2317 if (Subtarget->genLongCalls()) { 2318 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2319 "long-calls codegen is not position independent!"); 2320 // Handle a global address or an external symbol. If it's not one of 2321 // those, the target's already in a register, so we don't need to do 2322 // anything extra. 2323 if (isa<GlobalAddressSDNode>(Callee)) { 2324 // Create a constant pool entry for the callee address 2325 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2326 ARMConstantPoolValue *CPV = 2327 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2328 2329 // Get the address of the callee into a register 2330 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2331 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2332 Callee = DAG.getLoad( 2333 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2334 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2335 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2336 const char *Sym = S->getSymbol(); 2337 2338 // Create a constant pool entry for the callee address 2339 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2340 ARMConstantPoolValue *CPV = 2341 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2342 ARMPCLabelIndex, 0); 2343 // Get the address of the callee into a register 2344 SDValue CPAddr = 
DAG.getTargetConstantPool(CPV, PtrVt, 4); 2345 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2346 Callee = DAG.getLoad( 2347 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2348 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2349 } 2350 } else if (isa<GlobalAddressSDNode>(Callee)) { 2351 if (!PreferIndirect) { 2352 isDirect = true; 2353 bool isDef = GV->isStrongDefinitionForLinker(); 2354 2355 // ARM call to a local ARM function is predicable. 2356 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2357 // tBX takes a register source operand. 2358 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2359 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2360 Callee = DAG.getNode( 2361 ARMISD::WrapperPIC, dl, PtrVt, 2362 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2363 Callee = DAG.getLoad( 2364 PtrVt, dl, DAG.getEntryNode(), Callee, 2365 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2366 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2367 MachineMemOperand::MOInvariant); 2368 } else if (Subtarget->isTargetCOFF()) { 2369 assert(Subtarget->isTargetWindows() && 2370 "Windows is the only supported COFF target"); 2371 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2372 if (GV->hasDLLImportStorageClass()) 2373 TargetFlags = ARMII::MO_DLLIMPORT; 2374 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2375 TargetFlags = ARMII::MO_COFFSTUB; 2376 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2377 TargetFlags); 2378 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2379 Callee = 2380 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2381 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2382 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2383 } else { 2384 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2385 } 2386 } 2387 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2388 isDirect 
= true; 2389 // tBX takes a register source operand. 2390 const char *Sym = S->getSymbol(); 2391 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2392 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2393 ARMConstantPoolValue *CPV = 2394 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2395 ARMPCLabelIndex, 4); 2396 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2397 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2398 Callee = DAG.getLoad( 2399 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2400 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2401 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2402 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2403 } else { 2404 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2405 } 2406 } 2407 2408 // FIXME: handle tail calls differently. 2409 unsigned CallOpc; 2410 if (Subtarget->isThumb()) { 2411 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2412 CallOpc = ARMISD::CALL_NOLINK; 2413 else 2414 CallOpc = ARMISD::CALL; 2415 } else { 2416 if (!isDirect && !Subtarget->hasV5TOps()) 2417 CallOpc = ARMISD::CALL_NOLINK; 2418 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2419 // Emit regular call when code size is the priority 2420 !Subtarget->hasMinSize()) 2421 // "mov lr, pc; b _foo" to avoid confusing the RSP 2422 CallOpc = ARMISD::CALL_NOLINK; 2423 else 2424 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2425 } 2426 2427 std::vector<SDValue> Ops; 2428 Ops.push_back(Chain); 2429 Ops.push_back(Callee); 2430 2431 // Add argument registers to the end of the list so that they are known live 2432 // into the call. 2433 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2434 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2435 RegsToPass[i].second.getValueType())); 2436 2437 // Add a register mask operand representing the call-preserved registers. 
2438 if (!isTailCall) { 2439 const uint32_t *Mask; 2440 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2441 if (isThisReturn) { 2442 // For 'this' returns, use the R0-preserving mask if applicable 2443 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2444 if (!Mask) { 2445 // Set isThisReturn to false if the calling convention is not one that 2446 // allows 'returned' to be modeled in this way, so LowerCallResult does 2447 // not try to pass 'this' straight through 2448 isThisReturn = false; 2449 Mask = ARI->getCallPreservedMask(MF, CallConv); 2450 } 2451 } else 2452 Mask = ARI->getCallPreservedMask(MF, CallConv); 2453 2454 assert(Mask && "Missing call preserved mask for calling convention"); 2455 Ops.push_back(DAG.getRegisterMask(Mask)); 2456 } 2457 2458 if (InFlag.getNode()) 2459 Ops.push_back(InFlag); 2460 2461 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2462 if (isTailCall) { 2463 MF.getFrameInfo().setHasTailCall(); 2464 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2465 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2466 return Ret; 2467 } 2468 2469 // Returns a chain and a flag for retval copy to use. 2470 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2471 InFlag = Chain.getValue(1); 2472 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2473 2474 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2475 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2476 if (!Ins.empty()) 2477 InFlag = Chain.getValue(1); 2478 2479 // Handle result values, copying them out of physregs into vregs that we 2480 // return. 2481 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 2482 InVals, isThisReturn, 2483 isThisReturn ? OutVals[0] : SDValue()); 2484 } 2485 2486 /// HandleByVal - Every parameter *after* a byval parameter is passed 2487 /// on the stack. 
/// Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
///
/// \p Size is compared against 4 * (number of still-free argument registers),
/// i.e. it is treated as a byte count. When a register range is recorded for
/// the byval, \p Size is rewritten to the part that does not fit in that range
/// (zero if everything fits). On the early-return paths \p Size is left
/// untouched. \p Align is raised to at least 4 before use.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  // No argument register left at all: nothing to record here.
  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  // Skip registers until the distance to R4 is a multiple of the alignment
  // expressed in registers.
  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  // The alignment padding may have consumed the last free register.
  if (!Reg)
    return;

  // Number of bytes that fit into the registers [Reg, R4).
  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and parameter size greater than size of
  // all remaining GPR regs. In that case we can't split parameter, we must
  // send it to stack. We also must set NCRN to R4, so waste all
  // remaining registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // First register for byval parameter is the first register that wasn't
  // allocated before this method call, so it would be "reg".
  // If parameter is small enough to be saved in range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs,
  // else parameter would be split between registers and stack,
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, first register is allocated in the beginning of function already,
  // allocate the remaining amount of registers we need.
  // NOTE(review): the loop condition is '!=', so it relies on
  // ByValRegEnd > Reg, i.e. on Size >= 4 -- confirm the caller guarantees it.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  // The unsigned difference is reinterpreted as int so that a "negative"
  // result (Size < Excess) is clamped to zero by std::max.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
///
/// The argument must either be a CopyFromReg of a virtual register defined by
/// a load from a stack slot, or a load whose base pointer is a frame index.
/// The frame object found must be a fixed object whose offset equals
/// \p Offset and whose size equals the size of \p Arg in bytes. Byval
/// arguments are always rejected.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  // Sentinel value; every path that falls through to the assert below must
  // have replaced it with a real frame index.
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      // isLoadFromStackSlot fills in FI on success.
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// Returns false as soon as one of the checks below fails; returns true only
/// if all of them pass. The subtarget is asserted to support tail calls.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
    const bool isIndirect) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForReturn(CalleeCC, isVarArg),
                                  CCAssignFnForReturn(CallerCC, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      // i walks the locations, realArgIdx the arguments; custom-lowered
      // arguments occupy several locations, hence the extra ++i below.
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            // v2f64 consumes two more locations than f64.
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}

/// CanLowerReturn - Return the result of CCState::CheckReturn for the outgoing
/// values \p Outs, using the return assignment function selected by
/// CCAssignFnForReturn for \p CallConv / \p isVarArg.
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}

/// Build the return node for a function carrying the "interrupt" attribute.
/// The LR offset chosen from the attribute's string value is inserted into
/// \p RetOps as operand #1 (directly after the chain) and an
/// ARMISD::INTRET_FLAG node over \p RetOps is returned. An unrecognised
/// attribute value is reported as a fatal error.
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //    IRQ/FIQ: +4     "subs pc, lr, #4"
  //    SWI:     0      "subs pc, lr, #0"
  //    ABORT:   +4     "subs pc, lr, #4"
  //    UNDEF:   +4/+2  "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  // An empty attribute value is treated like IRQ/FIQ/ABORT.
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
}

/// LowerReturn - Copy the outgoing return values \p OutVals into the registers
/// assigned by the return calling convention and build the return node.
///
/// The result is an ARMISD::RET_FLAG node whose operands are the updated
/// chain, one register operand per return location, the callee-saved
/// registers handled via copy (if any), and the final glue. For functions
/// with the "interrupt" attribute on non-M-class subtargets the node is built
/// by LowerInterruptReturn instead.
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  // i indexes return locations, realRVLocIdx indexes return values; values
  // that need custom lowering occupy several locations.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11: f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // The f16 shortcut above already replaced Arg; do not re-bitcast it.
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        // NOTE: VA is a reference into RVLocs, so this assignment copies the
        // next location over the current element rather than rebinding VA.
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
  }
  // Append the callee-saved registers that are handled via copy, if the
  // register info reports any (the list is zero-terminated).
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}

/// isUsedByReturnOnly - Return true if the single result of \p N has exactly
/// one use and that use leads only to return nodes, through one of:
///   - a CopyToReg without a glue operand,
///   - a VMOVRRD whose users are all CopyToReg nodes (f64 in a GPR pair), or
///   - a single-use BITCAST feeding a CopyToReg (f32 in a single GPR).
/// On success \p Chain is set to the chain operand of the (first) CopyToReg;
/// on failure \p Chain is left unchanged.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    // Distinguish the two copies by their chain operand: the copy whose chain
    // is the other copy comes second; the other one is the top of the chain.
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // Every user of the final copy must be a return node, and there must be at
  // least one.
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

/// mayBeEmittedAsTailCall - Return true only if the subtarget supports tail
/// calls and \p CI is marked as a tail call.
bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  if (!CI->isTailCall())
    return false;

  return true;
}

// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
// The resulting WRITE_REGISTER node keeps operands 0 and 1 of Op and takes the
// two i32 halves (element 0, then element 1) in place of the i64 value.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc.
can only 3011 // be used to form addressing mode. These wrapped nodes will be selected 3012 // into MOVi. 3013 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3014 SelectionDAG &DAG) const { 3015 EVT PtrVT = Op.getValueType(); 3016 // FIXME there is no actual debug info here 3017 SDLoc dl(Op); 3018 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3019 SDValue Res; 3020 3021 // When generating execute-only code Constant Pools must be promoted to the 3022 // global data section. It's a bit ugly that we can't share them across basic 3023 // blocks, but this way we guarantee that execute-only behaves correct with 3024 // position-independent addressing modes. 3025 if (Subtarget->genExecuteOnly()) { 3026 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3027 auto T = const_cast<Type*>(CP->getType()); 3028 auto C = const_cast<Constant*>(CP->getConstVal()); 3029 auto M = const_cast<Module*>(DAG.getMachineFunction(). 3030 getFunction().getParent()); 3031 auto GV = new GlobalVariable( 3032 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3033 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3034 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3035 Twine(AFI->createPICLabelUId()) 3036 ); 3037 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3038 dl, PtrVT); 3039 return LowerGlobalAddress(GA, DAG); 3040 } 3041 3042 if (CP->isMachineConstantPoolEntry()) 3043 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 3044 CP->getAlignment()); 3045 else 3046 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 3047 CP->getAlignment()); 3048 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3049 } 3050 3051 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3052 return MachineJumpTableInfo::EK_Inline; 3053 } 3054 3055 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3056 SelectionDAG &DAG) const { 3057 MachineFunction &MF = DAG.getMachineFunction(); 3058 
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3059 unsigned ARMPCLabelIndex = 0; 3060 SDLoc DL(Op); 3061 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3062 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3063 SDValue CPAddr; 3064 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3065 if (!IsPositionIndependent) { 3066 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 3067 } else { 3068 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 3069 ARMPCLabelIndex = AFI->createPICLabelUId(); 3070 ARMConstantPoolValue *CPV = 3071 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3072 ARMCP::CPBlockAddress, PCAdj); 3073 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3074 } 3075 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3076 SDValue Result = DAG.getLoad( 3077 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3078 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3079 if (!IsPositionIndependent) 3080 return Result; 3081 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3082 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3083 } 3084 3085 /// Convert a TLS address reference into the correct sequence of loads 3086 /// and calls to compute the variable's address for Darwin, and return an 3087 /// SDValue containing the final node. 3088 3089 /// Darwin only has one TLS scheme which must be capable of dealing with the 3090 /// fully general situation, in the worst case. This means: 3091 /// + "extern __thread" declaration. 3092 /// + Defined in a possibly unknown dynamic library. 3093 /// 3094 /// The general system is that each __thread variable has a [3 x i32] descriptor 3095 /// which contains information used by the runtime to calculate the address. The 3096 /// only part of this the compiler needs to know about is the first word, which 3097 /// contains a function pointer that must be called with the address of the 3098 /// entire descriptor in "r0". 
3099 /// 3100 /// Since this descriptor may be in a different unit, in general access must 3101 /// proceed along the usual ARM rules. A common sequence to produce is: 3102 /// 3103 /// movw rT1, :lower16:_var$non_lazy_ptr 3104 /// movt rT1, :upper16:_var$non_lazy_ptr 3105 /// ldr r0, [rT1] 3106 /// ldr rT2, [r0] 3107 /// blx rT2 3108 /// [...address now in r0...] 3109 SDValue 3110 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, 3111 SelectionDAG &DAG) const { 3112 assert(Subtarget->isTargetDarwin() && 3113 "This function expects a Darwin target"); 3114 SDLoc DL(Op); 3115 3116 // First step is to get the address of the actua global symbol. This is where 3117 // the TLS descriptor lives. 3118 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); 3119 3120 // The first entry in the descriptor is a function pointer that we must call 3121 // to obtain the address of the variable. 3122 SDValue Chain = DAG.getEntryNode(); 3123 SDValue FuncTLVGet = DAG.getLoad( 3124 MVT::i32, DL, Chain, DescAddr, 3125 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 3126 /* Alignment = */ 4, 3127 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | 3128 MachineMemOperand::MOInvariant); 3129 Chain = FuncTLVGet.getValue(1); 3130 3131 MachineFunction &F = DAG.getMachineFunction(); 3132 MachineFrameInfo &MFI = F.getFrameInfo(); 3133 MFI.setAdjustsStack(true); 3134 3135 // TLS calls preserve all registers except those that absolutely must be 3136 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be 3137 // silly). 3138 auto TRI = 3139 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); 3140 auto ARI = static_cast<const ARMRegisterInfo *>(TRI); 3141 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); 3142 3143 // Finally, we can make the call. 
This is just a degenerate version of a 3144 // normal AArch64 call node: r0 takes the address of the descriptor, and 3145 // returns the address of the variable in this thread. 3146 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3147 Chain = 3148 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3149 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3150 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3151 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3152 } 3153 3154 SDValue 3155 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3156 SelectionDAG &DAG) const { 3157 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3158 3159 SDValue Chain = DAG.getEntryNode(); 3160 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3161 SDLoc DL(Op); 3162 3163 // Load the current TEB (thread environment block) 3164 SDValue Ops[] = {Chain, 3165 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3166 DAG.getTargetConstant(15, DL, MVT::i32), 3167 DAG.getTargetConstant(0, DL, MVT::i32), 3168 DAG.getTargetConstant(13, DL, MVT::i32), 3169 DAG.getTargetConstant(0, DL, MVT::i32), 3170 DAG.getTargetConstant(2, DL, MVT::i32)}; 3171 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3172 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3173 3174 SDValue TEB = CurrentTEB.getValue(0); 3175 Chain = CurrentTEB.getValue(1); 3176 3177 // Load the ThreadLocalStoragePointer from the TEB 3178 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3179 SDValue TLSArray = 3180 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3181 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3182 3183 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3184 // offset into the TLSArray. 
3185 3186 // Load the TLS index from the C runtime 3187 SDValue TLSIndex = 3188 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3189 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3190 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3191 3192 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3193 DAG.getConstant(2, DL, MVT::i32)); 3194 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3195 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3196 MachinePointerInfo()); 3197 3198 // Get the offset of the start of the .tls section (section base) 3199 const auto *GA = cast<GlobalAddressSDNode>(Op); 3200 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3201 SDValue Offset = DAG.getLoad( 3202 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3203 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 3204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3205 3206 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3207 } 3208 3209 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3210 SDValue 3211 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3212 SelectionDAG &DAG) const { 3213 SDLoc dl(GA); 3214 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3215 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3216 MachineFunction &MF = DAG.getMachineFunction(); 3217 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3218 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3219 ARMConstantPoolValue *CPV = 3220 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3221 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3222 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3223 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3224 Argument = DAG.getLoad( 3225 PtrVT, dl, DAG.getEntryNode(), Argument, 3226 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3227 SDValue Chain = Argument.getValue(1); 3228 3229 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3230 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3231 3232 // call __tls_get_addr. 3233 ArgListTy Args; 3234 ArgListEntry Entry; 3235 Entry.Node = Argument; 3236 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3237 Args.push_back(Entry); 3238 3239 // FIXME: is there useful debug info available here? 3240 TargetLowering::CallLoweringInfo CLI(DAG); 3241 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3242 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3243 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3244 3245 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3246 return CallResult.first; 3247 } 3248 3249 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3250 // "local exec" model. 
3251 SDValue 3252 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3253 SelectionDAG &DAG, 3254 TLSModel::Model model) const { 3255 const GlobalValue *GV = GA->getGlobal(); 3256 SDLoc dl(GA); 3257 SDValue Offset; 3258 SDValue Chain = DAG.getEntryNode(); 3259 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3260 // Get the Thread Pointer 3261 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3262 3263 if (model == TLSModel::InitialExec) { 3264 MachineFunction &MF = DAG.getMachineFunction(); 3265 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3266 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3267 // Initial exec model. 3268 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3269 ARMConstantPoolValue *CPV = 3270 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3271 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3272 true); 3273 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3274 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3275 Offset = DAG.getLoad( 3276 PtrVT, dl, Chain, Offset, 3277 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3278 Chain = Offset.getValue(1); 3279 3280 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3281 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3282 3283 Offset = DAG.getLoad( 3284 PtrVT, dl, Chain, Offset, 3285 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3286 } else { 3287 // local exec model 3288 assert(model == TLSModel::LocalExec); 3289 ARMConstantPoolValue *CPV = 3290 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3291 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3292 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3293 Offset = DAG.getLoad( 3294 PtrVT, dl, Chain, Offset, 3295 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3296 } 3297 3298 // The address of the thread local variable is the add of the thread 3299 // pointer with the offset of 
the variable. 3300 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3301 } 3302 3303 SDValue 3304 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3305 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3306 if (DAG.getTarget().useEmulatedTLS()) 3307 return LowerToTLSEmulatedModel(GA, DAG); 3308 3309 if (Subtarget->isTargetDarwin()) 3310 return LowerGlobalTLSAddressDarwin(Op, DAG); 3311 3312 if (Subtarget->isTargetWindows()) 3313 return LowerGlobalTLSAddressWindows(Op, DAG); 3314 3315 // TODO: implement the "local dynamic" model 3316 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3317 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3318 3319 switch (model) { 3320 case TLSModel::GeneralDynamic: 3321 case TLSModel::LocalDynamic: 3322 return LowerToTLSGeneralDynamicModel(GA, DAG); 3323 case TLSModel::InitialExec: 3324 case TLSModel::LocalExec: 3325 return LowerToTLSExecModels(GA, DAG, model); 3326 } 3327 llvm_unreachable("bogus TLS model"); 3328 } 3329 3330 /// Return true if all users of V are within function F, looking through 3331 /// ConstantExprs. 
3332 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3333 SmallVector<const User*,4> Worklist; 3334 for (auto *U : V->users()) 3335 Worklist.push_back(U); 3336 while (!Worklist.empty()) { 3337 auto *U = Worklist.pop_back_val(); 3338 if (isa<ConstantExpr>(U)) { 3339 for (auto *UU : U->users()) 3340 Worklist.push_back(UU); 3341 continue; 3342 } 3343 3344 auto *I = dyn_cast<Instruction>(U); 3345 if (!I || I->getParent()->getParent() != F) 3346 return false; 3347 } 3348 return true; 3349 } 3350 3351 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3352 const GlobalValue *GV, SelectionDAG &DAG, 3353 EVT PtrVT, const SDLoc &dl) { 3354 // If we're creating a pool entry for a constant global with unnamed address, 3355 // and the global is small enough, we can emit it inline into the constant pool 3356 // to save ourselves an indirection. 3357 // 3358 // This is a win if the constant is only used in one function (so it doesn't 3359 // need to be duplicated) or duplicating the constant wouldn't increase code 3360 // size (implying the constant is no larger than 4 bytes). 3361 const Function &F = DAG.getMachineFunction().getFunction(); 3362 3363 // We rely on this decision to inline being idemopotent and unrelated to the 3364 // use-site. We know that if we inline a variable at one use site, we'll 3365 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3366 // doesn't know about this optimization, so bail out if it's enabled else 3367 // we could decide to inline here (and thus never emit the GV) but require 3368 // the GV from fast-isel generated code. 
3369 if (!EnableConstpoolPromotion || 3370 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3371 return SDValue(); 3372 3373 auto *GVar = dyn_cast<GlobalVariable>(GV); 3374 if (!GVar || !GVar->hasInitializer() || 3375 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3376 !GVar->hasLocalLinkage()) 3377 return SDValue(); 3378 3379 // If we inline a value that contains relocations, we move the relocations 3380 // from .data to .text. This is not allowed in position-independent code. 3381 auto *Init = GVar->getInitializer(); 3382 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3383 Init->needsRelocation()) 3384 return SDValue(); 3385 3386 // The constant islands pass can only really deal with alignment requests 3387 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3388 // any type wanting greater alignment requirements than 4 bytes. We also 3389 // can only promote constants that are multiples of 4 bytes in size or 3390 // are paddable to a multiple of 4. Currently we only try and pad constants 3391 // that are strings for simplicity. 3392 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3393 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3394 unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); 3395 unsigned RequiredPadding = 4 - (Size % 4); 3396 bool PaddingPossible = 3397 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3398 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3399 Size == 0) 3400 return SDValue(); 3401 3402 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3403 MachineFunction &MF = DAG.getMachineFunction(); 3404 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3405 3406 // We can't bloat the constant pool too much, else the ConstantIslands pass 3407 // may fail to converge. 
If we haven't promoted this global yet (it may have 3408 // multiple uses), and promoting it would increase the constant pool size (Sz 3409 // > 4), ensure we have space to do so up to MaxTotal. 3410 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3411 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3412 ConstpoolPromotionMaxTotal) 3413 return SDValue(); 3414 3415 // This is only valid if all users are in a single function; we can't clone 3416 // the constant in general. The LLVM IR unnamed_addr allows merging 3417 // constants, but not cloning them. 3418 // 3419 // We could potentially allow cloning if we could prove all uses of the 3420 // constant in the current function don't care about the address, like 3421 // printf format strings. But that isn't implemented for now. 3422 if (!allUsersAreInFunction(GVar, &F)) 3423 return SDValue(); 3424 3425 // We're going to inline this global. Pad it out if needed. 3426 if (RequiredPadding != 4) { 3427 StringRef S = CDAInit->getAsString(); 3428 3429 SmallVector<uint8_t,16> V(S.size()); 3430 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3431 while (RequiredPadding--) 3432 V.push_back(0); 3433 Init = ConstantDataArray::get(*DAG.getContext(), V); 3434 } 3435 3436 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3437 SDValue CPAddr = 3438 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3439 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3440 AFI->markGlobalAsPromotedToConstantPool(GVar); 3441 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3442 PaddedSize - 4); 3443 } 3444 ++NumConstpoolPromoted; 3445 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3446 } 3447 3448 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3449 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3450 if (!(GV = GA->getBaseObject())) 3451 return false; 3452 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3453 return 
V->isConstant(); 3454 return isa<Function>(GV); 3455 } 3456 3457 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3458 SelectionDAG &DAG) const { 3459 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3460 default: llvm_unreachable("unknown object format"); 3461 case Triple::COFF: 3462 return LowerGlobalAddressWindows(Op, DAG); 3463 case Triple::ELF: 3464 return LowerGlobalAddressELF(Op, DAG); 3465 case Triple::MachO: 3466 return LowerGlobalAddressDarwin(Op, DAG); 3467 } 3468 } 3469 3470 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3471 SelectionDAG &DAG) const { 3472 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3473 SDLoc dl(Op); 3474 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3475 const TargetMachine &TM = getTargetMachine(); 3476 bool IsRO = isReadOnly(GV); 3477 3478 // promoteToConstantPool only if not generating XO text section 3479 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3480 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3481 return V; 3482 3483 if (isPositionIndependent()) { 3484 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3485 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3486 UseGOT_PREL ? ARMII::MO_GOT : 0); 3487 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3488 if (UseGOT_PREL) 3489 Result = 3490 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3491 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3492 return Result; 3493 } else if (Subtarget->isROPI() && IsRO) { 3494 // PC-relative. 3495 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3496 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3497 return Result; 3498 } else if (Subtarget->isRWPI() && !IsRO) { 3499 // SB-relative. 
3500 SDValue RelAddr; 3501 if (Subtarget->useMovt()) { 3502 ++NumMovwMovt; 3503 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3504 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3505 } else { // use literal pool for address constant 3506 ARMConstantPoolValue *CPV = 3507 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3508 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3509 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3510 RelAddr = DAG.getLoad( 3511 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3512 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3513 } 3514 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3515 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3516 return Result; 3517 } 3518 3519 // If we have T2 ops, we can materialize the address directly via movt/movw 3520 // pair. This is always cheaper. 3521 if (Subtarget->useMovt()) { 3522 ++NumMovwMovt; 3523 // FIXME: Once remat is capable of dealing with instructions with register 3524 // operands, expand this into two nodes. 
3525 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3526 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3527 } else { 3528 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3529 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3530 return DAG.getLoad( 3531 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3532 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3533 } 3534 } 3535 3536 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3537 SelectionDAG &DAG) const { 3538 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3539 "ROPI/RWPI not currently supported for Darwin"); 3540 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3541 SDLoc dl(Op); 3542 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3543 3544 if (Subtarget->useMovt()) 3545 ++NumMovwMovt; 3546 3547 // FIXME: Once remat is capable of dealing with instructions with register 3548 // operands, expand this into multiple nodes 3549 unsigned Wrapper = 3550 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3551 3552 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3553 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3554 3555 if (Subtarget->isGVIndirectSymbol(GV)) 3556 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3557 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3558 return Result; 3559 } 3560 3561 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3562 SelectionDAG &DAG) const { 3563 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3564 assert(Subtarget->useMovt() && 3565 "Windows on ARM expects to use movw/movt"); 3566 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3567 "ROPI/RWPI not currently supported for Windows"); 3568 3569 const TargetMachine &TM = getTargetMachine(); 3570 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3571 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3572 if (GV->hasDLLImportStorageClass()) 3573 TargetFlags = ARMII::MO_DLLIMPORT; 3574 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3575 TargetFlags = ARMII::MO_COFFSTUB; 3576 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3577 SDValue Result; 3578 SDLoc DL(Op); 3579 3580 ++NumMovwMovt; 3581 3582 // FIXME: Once remat is capable of dealing with instructions with register 3583 // operands, expand this into two nodes. 
3584 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3585 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3586 TargetFlags)); 3587 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3588 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3589 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3590 return Result; 3591 } 3592 3593 SDValue 3594 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3595 SDLoc dl(Op); 3596 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3597 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3598 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3599 Op.getOperand(1), Val); 3600 } 3601 3602 SDValue 3603 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3604 SDLoc dl(Op); 3605 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3606 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3607 } 3608 3609 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3610 SelectionDAG &DAG) const { 3611 SDLoc dl(Op); 3612 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3613 Op.getOperand(0)); 3614 } 3615 3616 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3617 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3618 unsigned IntNo = 3619 cast<ConstantSDNode>( 3620 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3621 ->getZExtValue(); 3622 switch (IntNo) { 3623 default: 3624 return SDValue(); // Don't custom lower most intrinsics. 
3625 case Intrinsic::arm_gnu_eabi_mcount: { 3626 MachineFunction &MF = DAG.getMachineFunction(); 3627 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3628 SDLoc dl(Op); 3629 SDValue Chain = Op.getOperand(0); 3630 // call "\01__gnu_mcount_nc" 3631 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3632 const uint32_t *Mask = 3633 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3634 assert(Mask && "Missing call preserved mask for calling convention"); 3635 // Mark LR an implicit live-in. 3636 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3637 SDValue ReturnAddress = 3638 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3639 std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; 3640 SDValue Callee = 3641 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3642 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3643 if (Subtarget->isThumb()) 3644 return SDValue( 3645 DAG.getMachineNode( 3646 ARM::tBL_PUSHLR, dl, ResultTys, 3647 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3648 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3649 0); 3650 return SDValue( 3651 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3652 {ReturnAddress, Callee, RegisterMask, Chain}), 3653 0); 3654 } 3655 } 3656 } 3657 3658 SDValue 3659 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3660 const ARMSubtarget *Subtarget) const { 3661 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3662 SDLoc dl(Op); 3663 switch (IntNo) { 3664 default: return SDValue(); // Don't custom lower most intrinsics. 
3665 case Intrinsic::thread_pointer: { 3666 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3667 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3668 } 3669 case Intrinsic::arm_cls: { 3670 const SDValue &Operand = Op.getOperand(1); 3671 const EVT VTy = Op.getValueType(); 3672 SDValue SRA = 3673 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3674 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3675 SDValue SHL = 3676 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3677 SDValue OR = 3678 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3679 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3680 return Result; 3681 } 3682 case Intrinsic::arm_cls64: { 3683 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3684 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3685 const SDValue &Operand = Op.getOperand(1); 3686 const EVT VTy = Op.getValueType(); 3687 3688 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3689 DAG.getConstant(1, dl, VTy)); 3690 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3691 DAG.getConstant(0, dl, VTy)); 3692 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3693 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3694 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3695 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3696 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3697 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3698 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3699 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3700 SDValue CheckLo = 3701 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3702 SDValue HiIsZero = 3703 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3704 SDValue AdjustedLo = 3705 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3706 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, 
AdjustedLo); 3707 SDValue Result = 3708 DAG.getSelect(dl, VTy, CheckLo, 3709 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3710 return Result; 3711 } 3712 case Intrinsic::eh_sjlj_lsda: { 3713 MachineFunction &MF = DAG.getMachineFunction(); 3714 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3715 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3716 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3717 SDValue CPAddr; 3718 bool IsPositionIndependent = isPositionIndependent(); 3719 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3720 ARMConstantPoolValue *CPV = 3721 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3722 ARMCP::CPLSDA, PCAdj); 3723 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3724 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3725 SDValue Result = DAG.getLoad( 3726 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3727 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3728 3729 if (IsPositionIndependent) { 3730 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3731 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3732 } 3733 return Result; 3734 } 3735 case Intrinsic::arm_neon_vabs: 3736 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3737 Op.getOperand(1)); 3738 case Intrinsic::arm_neon_vmulls: 3739 case Intrinsic::arm_neon_vmullu: { 3740 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3741 ? ARMISD::VMULLs : ARMISD::VMULLu; 3742 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3743 Op.getOperand(1), Op.getOperand(2)); 3744 } 3745 case Intrinsic::arm_neon_vminnm: 3746 case Intrinsic::arm_neon_vmaxnm: { 3747 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3748 ? 
ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    // The unsigned min/max intrinsics are only rewritten for non-FP types;
    // for a floating-point result type an empty SDValue is returned.
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
      ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    // Floating-point flavour: map onto the generic FMINIMUM/FMAXIMUM nodes.
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
      ? ISD::FMINIMUM : ISD::FMAXIMUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_pred_i2v:
  case Intrinsic::arm_mve_pred_v2i:
    // Both directions of the MVE predicate conversion use the same node.
    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  }
}

/// Lower an ISD::ATOMIC_FENCE node.
///
/// Operand 0 is the chain, operand 1 is read as the AtomicOrdering and
/// operand 2 as the SyncScope::ID. Single-thread fences are returned
/// unchanged; otherwise the fence becomes either an ARMISD::MEMBARRIER_MCR
/// node (no data-barrier support) or an arm_dmb intrinsic whose barrier
/// domain depends on the subtarget and the ordering.
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  // A single-thread fence is not lowered to a barrier here; hand the node back
  // untouched.
  if (SSID == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  // Default barrier domain; refined below for M-class and for subtargets that
  // prefer ISHST on release fences.
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

/// Lower an ISD::PREFETCH node to ARMISD::PRELOAD, or drop it (returning only
/// the incoming chain, operand 0) when the subtarget has no suitable preload
/// instruction.
///
/// Operand 1 is forwarded unchanged to the PRELOAD node. The low bit of
/// operand 2 is inverted to form `isRead`, and operand 4 supplies `isData`.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    // Without it a non-read prefetch is dropped; only the chain survives.
    return Op.getOperand(0);

  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}

/// Lower ISD::VASTART: store the address of the function's VarArgsFrameIndex
/// slot to the pointer given in operand 1. Operand 0 is the chain and
/// operand 2 carries the source Value used for the store's pointer info.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

/// Reassemble an f64 formal argument from two i32 pieces.
///
/// \p VA describes the first piece, which is always read from a register.
/// \p NextVA describes the second piece, which is either loaded from a 4-byte
/// fixed stack object or read from a second register. The two halves are
/// swapped when the subtarget is not little-endian, then combined with
/// ARMISD::VMOVDRR. \p Root is the chain used for the register copies and the
/// stack load.
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Thumb1-only functions use the tGPR register class for the live-ins.
  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // On big-endian subtargets the two 32-bit halves arrive in the opposite
  // order, so swap them before building the f64.
  if (!Subtarget->isLittle())
    std::swap(ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval). Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index registers were stored into.
//
// Chain is updated in place to a TokenFactor of the emitted stores when at
// least one register is stored. OrigArg (may be null) is only used for the
// stores' pointer info.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases possible:
  // Case #1. Non-var-args function, and we meet first byval parameter.
  //          Setup first unallocated register as first byval register;
  //          eat all remained registers
  //          (these two actions are performed by HandleByVal method).
  //          Then, here, we initialize stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function, that doesn't contain byval parameters.
  //          The same: eat all remained unallocated registers,
  //          initialize stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  // [RBegin, REnd) is the range of physical registers to spill.
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    // A byval record exists for this index: use its register range.
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    // No record: spill every still-unallocated argument GPR. An index of 4
    // means none is left, which yields the empty range [R4, R4).
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  // When there is something to store, the caller-supplied offset is replaced
  // by a negative one: 4 bytes for each register from RBegin up to (but not
  // including) R4.
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  // Copy each register out of its live-in and store it to consecutive 4-byte
  // slots starting at the frame object's address.
  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  // NOTE(review): ArgOffset and ForceMutable are not read in this body; the
  // stack offset passed on below comes from CCInfo.getNextStackOffset().
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there is no regs to be stored, just point address after last
  // argument passed via stack.
  // The record index passed is one past the last byval record and the object
  // size is at least 4 bytes.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(),
                                  std::max(4U, TotalArgRegsSaveSize));
  AFI->setVarArgsFrameIndex(FrameIndex);
}

/// Lower the incoming (formal) arguments described by \p Ins.
///
/// Locations are assigned through CCState using the calling-convention
/// function selected for \p CallConv / \p isVarArg. One SDValue per argument
/// is then appended to \p InVals: a register copy (with any needed
/// bitcast/assert-ext/truncate), a reassembled f64/v2f64, a frame index for
/// byval aggregates, or a load from a fixed stack object. For variadic
/// functions that use va_start, the unallocated argument registers are also
/// spilled. Returns the (possibly updated) chain.
SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  // NOTE(review): ArgValues is declared but never used in this function.
  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  // CurOrigArg/CurArgIdx track the IR argument that corresponds to the
  // location currently being processed.
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet byval parameter.
  // We also increase this value in case of varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
  // We need to know this before we allocate the first byval or variadic
  // argument, as they will be allocated a stack slot below the CFA (Canonical
  // Frame Address, the stack pointer at entry to the function).
  unsigned ArgRegBegin = ARM::R4;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // Stop as soon as every in-register byval record has been visited.
    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
      break;

    CCValAssign &VA = ArgLocs[i];
    unsigned Index = VA.getValNo();
    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
    if (!Flags.isByVal())
      continue;

    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
    unsigned RBegin, REnd;
    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
    ArgRegBegin = std::min(ArgRegBegin, RBegin);

    CCInfo.nextInRegsParam();
  }
  // The scan above advanced CCInfo's byval cursor; reset it so the main loop
  // below can walk the records again from the start.
  CCInfo.rewindByValRegsInfo();

  int lastInsIndex = -1;
  if (isVarArg && MFI.hasVAStart()) {
    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    if (RegIdx != array_lengthof(GPRArgRegs))
      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  }

  // 4 bytes for each register from ArgRegBegin up to (not including) R4.
  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    if (Ins[VA.getValNo()].isOrigArg()) {
      std::advance(CurOrigArg,
                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
    }
    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();

      if (VA.needsCustom()) {
        // f64 and vector types are split up into multiple registers or
        // combinations of registers and stack slots.
        if (VA.getLocVT() == MVT::v2f64) {
          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                   Chain, DAG, dl);
          // NOTE(review): VA is a reference, so this assignment copies the
          // next location into the ArgLocs slot VA was bound to rather than
          // rebinding VA.
          VA = ArgLocs[++i]; // skip ahead to next loc
          SDValue ArgValue2;
          if (VA.isMemLoc()) {
            // Second f64 half lives entirely on the stack.
            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(
                                        DAG.getMachineFunction(), FI));
          } else {
            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                             Chain, DAG, dl);
          }
          // Build the v2f64 by inserting the two f64 halves into an undef
          // vector.
          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue1,
                                 DAG.getIntPtrConstant(0, dl));
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue2,
                                 DAG.getIntPtrConstant(1, dl));
        } else
          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
      } else {
        const TargetRegisterClass *RC;

        // Pick the register class that matches the location type.
        if (RegVT == MVT::f16)
          RC = &ARM::HPRRegClass;
        else if (RegVT == MVT::f32)
          RC = &ARM::SPRRegClass;
        else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
          RC = &ARM::DPRRegClass;
        else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
          RC = &ARM::QPRRegClass;
        else if (RegVT == MVT::i32)
          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

        // If this value is passed in r0 and has the returned attribute (e.g.
        // C++ 'structors), record this fact for later use.
        if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
          AFI->setPreservesR0();
        }
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits.  Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);
    } else { // VA.isRegLoc()
      // sanity check
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = VA.getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex)
        {
          ISD::ArgFlagsTy Flags = Ins[index].Flags;
          // FIXME: For now, all byval parameter objects are marked mutable.
          // This can be changed with more analysis.
          // In case of tail call optimization mark all arguments mutable.
          // Since they could be overwritten by lowering of arguments in case of
          // a tail call.
          if (Flags.isByVal()) {
            assert(Ins[index].isOrigArg() &&
                   "Byval arguments cannot be implicit");
            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

            // Spill the register part (if any) next to the stack part and
            // hand back the frame index of the resulting object.
            int FrameIndex = StoreByValRegs(
                CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
                VA.getLocMemOffset(), Flags.getByValSize());
            InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
            CCInfo.nextInRegsParam();
          } else {
            unsigned FIOffset = VA.getLocMemOffset();
            int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                           FIOffset, true);

            // Create load nodes to retrieve arguments from the stack.
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                         MachinePointerInfo::getFixedStack(
                                             DAG.getMachineFunction(), FI)));
          }
          lastInsIndex = index;
        }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart())
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                         CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);

  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
///
/// Three forms are recognised: a ConstantFPSDNode, a load whose address is an
/// ARMISD::Wrapper around a constant-pool entry holding a ConstantFP, and an
/// f64 bitcast of an ARMISD::VMOVIMM whose operand is the null constant.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
///
/// \p ARMcc receives the ARM condition code as an i32 constant. The returned
/// value is the glue result of the compare node (or, for the Thumb1
/// "lsls" special case, of a copy into CPSR). LHS, RHS and CC are local
/// copies and may be rewritten before the compare is built.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
      // Each case swaps between the strict and non-strict form of the
      // predicate and guards against wrapping at the end of the range.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    // Only the LHS is a shift here, so swap the operands (and the predicate)
    // to move it into the second position.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  // Thumb1 has very limited immediate modes, so turning an "and" into a
  // shift can save multiple instructions.
  //
  // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  // into "((x << n) >> n)".  But that isn't necessarily profitable on its
  // own. If it's the operand to an unsigned comparison with an immediate,
  // we can eliminate one of the shifts: we transform
  // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  //
  // We avoid transforming cases which aren't profitable due to encoding
  // details:
  //
  // 1. C2 fits into the immediate field of a cmp, and the transformed version
  // would not; in that case, we're essentially trading one immediate load for
  // another.
  // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  // 3. C2 is zero; we have other code for this special case.
  //
  // FIXME: Figure out profitability for Thumb2; we usually can't save an
  // instruction, since the AND is always one instruction anyway, but we could
  // use narrow instructions in some cases.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
      LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
      !isSignedIntSetCC(CC)) {
    unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
    auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
    uint64_t RHSV = RHSC->getZExtValue();
    if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
      unsigned ShiftBits = countLeadingZeros(Mask);
      if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
        SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
        LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
        RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
      }
    }
  }

  // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
  // single "lsls x, c+1".  The shift sets the "C" and "Z" flags the same
  // way a cmp would.
  // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  // some tweaks to the heuristics for the previous and->shift transform.
  // FIXME: Optimize cases where the LHS isn't a shift.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                LHS.getOperand(0),
                                DAG.getConstant(ShiftAmt, dl, MVT::i32));
    // Copy the shift's second result into CPSR and return the glue from that
    // copy in place of a compare node.
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
      default: break;
      case ARMCC::GE:
        CondCode = ARMCC::PL;
        break;
      case ARMCC::LT:
        CondCode = ARMCC::MI;
        break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4356 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4357 SelectionDAG &DAG, const SDLoc &dl, 4358 bool Signaling) const { 4359 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4360 SDValue Cmp; 4361 if (!isFloatingPointZero(RHS)) 4362 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4363 dl, MVT::Glue, LHS, RHS); 4364 else 4365 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4366 dl, MVT::Glue, LHS); 4367 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4368 } 4369 4370 /// duplicateCmp - Glue values can have only one use, so this function 4371 /// duplicates a comparison node. 4372 SDValue 4373 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4374 unsigned Opc = Cmp.getOpcode(); 4375 SDLoc DL(Cmp); 4376 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4377 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4378 4379 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4380 Cmp = Cmp.getOperand(0); 4381 Opc = Cmp.getOpcode(); 4382 if (Opc == ARMISD::CMPFP) 4383 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4384 else { 4385 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4386 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4387 } 4388 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4389 } 4390 4391 // This function returns three things: the arithmetic computation itself 4392 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4393 // comparison and the condition code define the case in which the arithmetic 4394 // computation *does not* overflow. 
4395 std::pair<SDValue, SDValue> 4396 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4397 SDValue &ARMcc) const { 4398 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4399 4400 SDValue Value, OverflowCmp; 4401 SDValue LHS = Op.getOperand(0); 4402 SDValue RHS = Op.getOperand(1); 4403 SDLoc dl(Op); 4404 4405 // FIXME: We are currently always generating CMPs because we don't support 4406 // generating CMN through the backend. This is not as good as the natural 4407 // CMP case because it causes a register dependency and cannot be folded 4408 // later. 4409 4410 switch (Op.getOpcode()) { 4411 default: 4412 llvm_unreachable("Unknown overflow instruction!"); 4413 case ISD::SADDO: 4414 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4415 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4416 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4417 break; 4418 case ISD::UADDO: 4419 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4420 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4421 // We do not use it in the USUBO case as Value may not be used. 4422 Value = DAG.getNode(ARMISD::ADDC, dl, 4423 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4424 .getValue(0); 4425 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4426 break; 4427 case ISD::SSUBO: 4428 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4429 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4430 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4431 break; 4432 case ISD::USUBO: 4433 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4434 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4435 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4436 break; 4437 case ISD::UMULO: 4438 // We generate a UMUL_LOHI and then check if the high word is 0. 
4439 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4440 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4441 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4442 LHS, RHS); 4443 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4444 DAG.getConstant(0, dl, MVT::i32)); 4445 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4446 break; 4447 case ISD::SMULO: 4448 // We generate a SMUL_LOHI and then check if all the bits of the high word 4449 // are the same as the sign bit of the low word. 4450 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4451 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4452 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4453 LHS, RHS); 4454 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4455 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4456 Value.getValue(0), 4457 DAG.getConstant(31, dl, MVT::i32))); 4458 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4459 break; 4460 } // switch (...) 4461 4462 return std::make_pair(Value, OverflowCmp); 4463 } 4464 4465 SDValue 4466 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4467 // Let legalize expand this if it isn't a legal type yet. 4468 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4469 return SDValue(); 4470 4471 SDValue Value, OverflowCmp; 4472 SDValue ARMcc; 4473 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4474 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4475 SDLoc dl(Op); 4476 // We use 0 and 1 as false and true values. 
4477 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4478 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4479 EVT VT = Op.getValueType(); 4480 4481 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4482 ARMcc, CCR, OverflowCmp); 4483 4484 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4485 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4486 } 4487 4488 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4489 SelectionDAG &DAG) { 4490 SDLoc DL(BoolCarry); 4491 EVT CarryVT = BoolCarry.getValueType(); 4492 4493 // This converts the boolean value carry into the carry flag by doing 4494 // ARMISD::SUBC Carry, 1 4495 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4496 DAG.getVTList(CarryVT, MVT::i32), 4497 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4498 return Carry.getValue(1); 4499 } 4500 4501 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4502 SelectionDAG &DAG) { 4503 SDLoc DL(Flags); 4504 4505 // Now convert the carry flag into a boolean carry. We do this 4506 // using ARMISD:ADDE 0, 0, Carry 4507 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4508 DAG.getConstant(0, DL, MVT::i32), 4509 DAG.getConstant(0, DL, MVT::i32), Flags); 4510 } 4511 4512 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4513 SelectionDAG &DAG) const { 4514 // Let legalize expand this if it isn't a legal type yet. 4515 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4516 return SDValue(); 4517 4518 SDValue LHS = Op.getOperand(0); 4519 SDValue RHS = Op.getOperand(1); 4520 SDLoc dl(Op); 4521 4522 EVT VT = Op.getValueType(); 4523 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4524 SDValue Value; 4525 SDValue Overflow; 4526 switch (Op.getOpcode()) { 4527 default: 4528 llvm_unreachable("Unknown overflow instruction!"); 4529 case ISD::UADDO: 4530 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4531 // Convert the carry flag into a boolean value. 
4532 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4533 break; 4534 case ISD::USUBO: { 4535 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4536 // Convert the carry flag into a boolean value. 4537 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4538 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4539 // value. So compute 1 - C. 4540 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4541 DAG.getConstant(1, dl, MVT::i32), Overflow); 4542 break; 4543 } 4544 } 4545 4546 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4547 } 4548 4549 static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, 4550 const ARMSubtarget *Subtarget) { 4551 EVT VT = Op.getValueType(); 4552 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 4553 return SDValue(); 4554 if (!VT.isSimple()) 4555 return SDValue(); 4556 4557 unsigned NewOpcode; 4558 bool IsAdd = Op->getOpcode() == ISD::SADDSAT; 4559 switch (VT.getSimpleVT().SimpleTy) { 4560 default: 4561 return SDValue(); 4562 case MVT::i8: 4563 NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; 4564 break; 4565 case MVT::i16: 4566 NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b; 4567 break; 4568 } 4569 4570 SDLoc dl(Op); 4571 SDValue Add = 4572 DAG.getNode(NewOpcode, dl, MVT::i32, 4573 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 4574 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 4575 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 4576 } 4577 4578 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4579 SDValue Cond = Op.getOperand(0); 4580 SDValue SelectTrue = Op.getOperand(1); 4581 SDValue SelectFalse = Op.getOperand(2); 4582 SDLoc dl(Op); 4583 unsigned Opc = Cond.getOpcode(); 4584 4585 if (Cond.getResNo() == 1 && 4586 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4587 Opc == ISD::USUBO)) { 4588 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4589 return SDValue(); 4590 4591 SDValue Value, OverflowCmp; 4592 SDValue ARMcc; 4593 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4594 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4595 EVT VT = Op.getValueType(); 4596 4597 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4598 OverflowCmp, DAG); 4599 } 4600 4601 // Convert: 4602 // 4603 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4604 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4605 // 4606 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4607 const ConstantSDNode *CMOVTrue = 4608 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4609 const ConstantSDNode *CMOVFalse = 4610 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4611 4612 if (CMOVTrue && CMOVFalse) { 4613 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4614 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4615 4616 SDValue True; 4617 SDValue False; 4618 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4619 True = SelectTrue; 4620 False = SelectFalse; 4621 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4622 True = SelectFalse; 4623 False = SelectTrue; 4624 } 4625 4626 if (True.getNode() && False.getNode()) 
{ 4627 EVT VT = Op.getValueType(); 4628 SDValue ARMcc = Cond.getOperand(2); 4629 SDValue CCR = Cond.getOperand(3); 4630 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4631 assert(True.getValueType() == VT); 4632 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4633 } 4634 } 4635 } 4636 4637 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4638 // undefined bits before doing a full-word comparison with zero. 4639 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4640 DAG.getConstant(1, dl, Cond.getValueType())); 4641 4642 return DAG.getSelectCC(dl, Cond, 4643 DAG.getConstant(0, dl, Cond.getValueType()), 4644 SelectTrue, SelectFalse, ISD::SETNE); 4645 } 4646 4647 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4648 bool &swpCmpOps, bool &swpVselOps) { 4649 // Start by selecting the GE condition code for opcodes that return true for 4650 // 'equality' 4651 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4652 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4653 CondCode = ARMCC::GE; 4654 4655 // and GT for opcodes that return false for 'equality'. 4656 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4657 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4658 CondCode = ARMCC::GT; 4659 4660 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4661 // to swap the compare operands. 4662 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4663 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4664 swpCmpOps = true; 4665 4666 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4667 // If we have an unordered opcode, we need to swap the operands to the VSEL 4668 // instruction (effectively negating the condition). 4669 // 4670 // This also has the effect of swapping which one of 'less' or 'greater' 4671 // returns true, so we also swap the compare operands. 
It also switches 4672 // whether we return true for 'equality', so we compensate by picking the 4673 // opposite condition code to our original choice. 4674 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4675 CC == ISD::SETUGT) { 4676 swpCmpOps = !swpCmpOps; 4677 swpVselOps = !swpVselOps; 4678 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4679 } 4680 4681 // 'ordered' is 'anything but unordered', so use the VS condition code and 4682 // swap the VSEL operands. 4683 if (CC == ISD::SETO) { 4684 CondCode = ARMCC::VS; 4685 swpVselOps = true; 4686 } 4687 4688 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4689 // code and swap the VSEL operands. Also do this if we don't care about the 4690 // unordered case. 4691 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4692 CondCode = ARMCC::EQ; 4693 swpVselOps = true; 4694 } 4695 } 4696 4697 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4698 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4699 SDValue Cmp, SelectionDAG &DAG) const { 4700 if (!Subtarget->hasFP64() && VT == MVT::f64) { 4701 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4702 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4703 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4704 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4705 4706 SDValue TrueLow = TrueVal.getValue(0); 4707 SDValue TrueHigh = TrueVal.getValue(1); 4708 SDValue FalseLow = FalseVal.getValue(0); 4709 SDValue FalseHigh = FalseVal.getValue(1); 4710 4711 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4712 ARMcc, CCR, Cmp); 4713 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4714 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4715 4716 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4717 } else { 4718 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4719 Cmp); 4720 } 4721 } 4722 4723 static bool isGTorGE(ISD::CondCode CC) { 4724 return CC == 
ISD::SETGT || CC == ISD::SETGE; 4725 } 4726 4727 static bool isLTorLE(ISD::CondCode CC) { 4728 return CC == ISD::SETLT || CC == ISD::SETLE; 4729 } 4730 4731 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4732 // All of these conditions (and their <= and >= counterparts) will do: 4733 // x < k ? k : x 4734 // x > k ? x : k 4735 // k < x ? x : k 4736 // k > x ? k : x 4737 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4738 const SDValue TrueVal, const SDValue FalseVal, 4739 const ISD::CondCode CC, const SDValue K) { 4740 return (isGTorGE(CC) && 4741 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4742 (isLTorLE(CC) && 4743 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4744 } 4745 4746 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 4747 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4748 const SDValue TrueVal, const SDValue FalseVal, 4749 const ISD::CondCode CC, const SDValue K) { 4750 return (isGTorGE(CC) && 4751 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4752 (isLTorLE(CC) && 4753 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4754 } 4755 4756 // Check if two chained conditionals could be converted into SSAT or USAT. 4757 // 4758 // SSAT can replace a set of two conditional selectors that bound a number to an 4759 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4760 // 4761 // x < -k ? -k : (x > k ? k : x) 4762 // x < -k ? -k : (x < k ? x : k) 4763 // x > -k ? (x > k ? k : x) : -k 4764 // x < k ? (x < -k ? -k : x) : k 4765 // etc. 4766 // 4767 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is 4768 // a power of 2. 4769 // 4770 // It returns true if the conversion can be done, false otherwise. 
4771 // Additionally, the variable is returned in parameter V, the constant in K and 4772 // usat is set to true if the conditional represents an unsigned saturation 4773 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4774 uint64_t &K, bool &usat) { 4775 SDValue LHS1 = Op.getOperand(0); 4776 SDValue RHS1 = Op.getOperand(1); 4777 SDValue TrueVal1 = Op.getOperand(2); 4778 SDValue FalseVal1 = Op.getOperand(3); 4779 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4780 4781 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4782 if (Op2.getOpcode() != ISD::SELECT_CC) 4783 return false; 4784 4785 SDValue LHS2 = Op2.getOperand(0); 4786 SDValue RHS2 = Op2.getOperand(1); 4787 SDValue TrueVal2 = Op2.getOperand(2); 4788 SDValue FalseVal2 = Op2.getOperand(3); 4789 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4790 4791 // Find out which are the constants and which are the variables 4792 // in each conditional 4793 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4794 ? &RHS1 4795 : nullptr; 4796 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4797 ? &RHS2 4798 : nullptr; 4799 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4800 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4801 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4802 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4803 4804 // We must detect cases where the original operations worked with 16- or 4805 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 4806 // must work with sign-extended values but the select operations return 4807 // the original non-extended value. 
4808 SDValue V2TmpReg = V2Tmp; 4809 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4810 V2TmpReg = V2Tmp->getOperand(0); 4811 4812 // Check that the registers and the constants have the correct values 4813 // in both conditionals 4814 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4815 V2TmpReg != V2) 4816 return false; 4817 4818 // Figure out which conditional is saturating the lower/upper bound. 4819 const SDValue *LowerCheckOp = 4820 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4821 ? &Op 4822 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4823 ? &Op2 4824 : nullptr; 4825 const SDValue *UpperCheckOp = 4826 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4827 ? &Op 4828 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4829 ? &Op2 4830 : nullptr; 4831 4832 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4833 return false; 4834 4835 // Check that the constant in the lower-bound check is 4836 // the opposite of the constant in the upper-bound check 4837 // in 1's complement. 4838 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4839 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4840 int64_t PosVal = std::max(Val1, Val2); 4841 int64_t NegVal = std::min(Val1, Val2); 4842 4843 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4844 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4845 isPowerOf2_64(PosVal + 1)) { 4846 4847 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4848 if (Val1 == ~Val2) 4849 usat = false; 4850 else if (NegVal == 0) 4851 usat = true; 4852 else 4853 return false; 4854 4855 V = V2; 4856 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4857 4858 return true; 4859 } 4860 4861 return false; 4862 } 4863 4864 // Check if a condition of the type x < k ? k : x can be converted into a 4865 // bit operation instead of conditional moves. 
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
// Note that V is written before the match is validated, so it may be modified
// even when the function returns false; SatK is only written on success.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK)
{
  // SELECT_CC operands: (LHS, RHS, TrueVal, FalseVal, CondCode).
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  // The constant side of the comparison, preferring LHS when both are
  // constants; null when neither is.
  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
                                               ? &RHS
                                               : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  // Split the selected values into the constant (KTmp) and the other arm (V),
  // and take the non-constant side of the comparison as VTmp.
  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

/// Return true if \p VT is a scalar floating-point type the current subtarget
/// cannot handle natively: f32 without VFP2, f64 without FP64, or f16 without
/// full FP16 support. Every other type yields false.
bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

/// Custom lowering for ISD::SELECT_CC.
///
/// The following strategies are tried in order, the first that applies wins:
///  - two chained saturating selects become a single SSAT/USAT (ARM mode with
///    v6 ops, or Thumb2);
///  - an i32 "x < k ? k : x" style select with k == 0 or k == -1 becomes a
///    shift plus AND/OR;
///  - on v8.1-M Mainline, a select between two related i32 constants becomes
///    CSINV / CSNEG / CSINC;
///  - otherwise a compare followed by one CMOV (integer compare) or up to two
///    CMOVs (floating-point compare) is emitted, with operands and condition
///    rearranged so that VSEL-compatible condition codes are used for
///    f16/f32/f64 results when the subtarget has FP-ARMv8.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    // SatConstant + 1 is a power of two (checked by isSaturatingConditional),
    // so its trailing-ones count is the immediate passed to the saturate node.
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV is all ones when SatValue is negative and zero otherwise.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0): clear x when it is negative.
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1): force x to all ones when it is negative.
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
    // Any other constant falls through to the generic lowering below.
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  // v8.1-M Mainline: when both selected values are constants related by
  // inversion, negation or increment, one constant is enough.
  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      // Same as the previous case with the roles swapped, so swap the values
      // and invert the condition to compensate.
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR checking TVal is 0, possibly inverting the condition
      // to get there. CSINC is not invertible like the other two (~(~a) == a,
      // -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
      if (TVal == 0)
        TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      // Deliberately shadows the outer VT with the type of the (possibly
      // replaced) TrueVal.
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  // Floating-point compares the subtarget cannot do natively are softened
  // first; LHS, RHS and CC are rewritten in place.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Integer compare (including a softened floating-point compare).
  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      // LT, LE, VC and NE are the inverses of the codes VSEL accepts, so
      // invert the condition and swap the selected values instead.
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  // Floating-point compare. Some conditions need two ARM condition codes;
  // CondCode2 stays ARMCC::AL when one is enough (see the check further down).
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the requested swaps when the resulting code is one VSEL can
    // encode.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    // Second condition: chain another CMOV that keeps the first result unless
    // CondCode2 holds.
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
/// \p SeenZero is set to true (and never cleared) when \p Op is a
/// floating-point zero; apart from zero, only normal loads qualify.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  return ISD::isNormalLoad(N);
}

/// Produce an i32 view of an f32 compare operand: a floating-point zero
/// becomes the constant 0 and a load is re-issued as an i32 load from the
/// same address. Any other operand is a programming error (unreachable).
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlignment(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// Split an f64 compare operand into two i32 values. A floating-point zero
/// yields two zero constants; a load is replaced by two i32 loads, \p RetVal1
/// from the original address and \p RetVal2 from that address plus 4. Any
/// other operand is a programming error (unreachable).
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    // The second word sits 4 bytes in, so its alignment is derived from the
    // original alignment and that offset.
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  // BR_CC operands: (Chain, CondCode, LHS, RHS, Dest).
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Both operands must be convertible and at least one must be a
  // floating-point zero; otherwise an empty SDValue is returned.
  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Clears bit 31 of a 32-bit word before the integer compare.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // Otherwise: split each operand into two i32 words, mask the second word
    // of each, and branch with BCC_i64.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}

/// Custom lowering for ISD::BRCOND. Only handles a condition that is the
/// overflow result (result #1) of an add/sub-with-overflow node, or of a
/// mul-with-overflow node when not in Thumb1-only mode. Returns an empty
/// SDValue in every other case, and also when the overflow op's value type is
/// not legal.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                      !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}

/// Custom lowering for ISD::BR_CC. Handles, in order: softening of
/// unsupported floating-point compares, branches on an overflow result
/// compared against 0 or 1, plain i32 compares, the unsafe-fp-math integer
/// shortcut (OptimizeVFPBrcond), and finally a VFP compare followed by one or
/// two ARMISD::BRCOND nodes.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  // BR_CC operands: (Chain, CondCode, LHS, RHS, Dest).
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                      !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    // "overflow != 1" and "overflow == 0" both branch when the overflow bit
    // is clear, so those two forms need the opposite condition.
    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  // Equality-style floating-point compares may be turned into integer
  // compares when unsafe FP math is enabled.
  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  if (CondCode2 != ARMCC::AL) {
    // A second branch on CondCode2, chained on the first branch and reusing
    // its glue result as the flags input.
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}

/// Custom lowering for ISD::BR_JT. The index is scaled by 4 and added to the
/// wrapped jump-table address. Thumb2 and v8-M Baseline Thumb branch into the
/// table (BR2_JT); other configurations load the table entry and branch to
/// it (BR_JT), adding the table address first when the code is position
/// independent or ROPI.
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    // Note: the unscaled index (operand 2 of Op) is passed, not the scaled one.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // The loaded entry is added to the table address to form the target.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // The loaded entry is used directly as the target.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}

/// Lower a vector FP-to-integer conversion. Results with i32 elements are
/// kept as-is when the source has f32 elements and unrolled otherwise. For
/// other results, the conversion is done in an integer vector type matching
/// the source (v4f32 -> v4i32; v4f16 -> v4i16 and v8f16 -> v8i16 with full
/// FP16) and then truncated, unless the result type is neither v4i16 nor
/// v8i16, in which case the operation is unrolled.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}

/// Custom lowering for FP_TO_SINT / FP_TO_UINT and their strict variants.
/// Vector results are delegated to LowerVectorFP_TO_INT. A source type the
/// subtarget cannot handle becomes a runtime library call. Remaining strict
/// nodes are rewritten to their non-strict form with the incoming chain passed
/// through; everything else is returned unchanged.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  // Strict nodes carry the chain as operand 0, so the source value is
  // operand 1.
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    SDValue Result;
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                    Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}

/// Lower a vector integer-to-FP conversion. Sources with i32 elements are
/// kept as-is when the result has f32 elements and unrolled otherwise.
/// v4i16 / v8i16 sources are sign- or zero-extended (matching the opcode) to
/// an integer vector chosen from the result type and converted from there;
/// result types without such a mapping are unrolled.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    Opc = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    Opc = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}

/// Custom lowering for SINT_TO_FP / UINT_TO_FP. Vector results are delegated
/// to LowerVectorINT_TO_FP, a result type the subtarget cannot handle becomes
/// a runtime library call, and everything else is returned unchanged.
SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);
  if (isUnsupportedFloatingType(VT)) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::SINT_TO_FP)
      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    MakeLibCallOptions CallOptions;
    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                       CallOptions, SDLoc(Op)).first;
  }

  return Op;
}

/// Custom lowering for ISD::FCOPYSIGN (operand 0 supplies the magnitude,
/// operand 1 the sign). With NEON available and operand 0 not coming from a
/// BITCAST or VMOVDRR, the sign bit is merged with vector bitwise operations;
/// otherwise it is merged with i32 integer operations on the word holding the
/// sign bit.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // A BITCAST or VMOVDRR producer is taken to mean the value already lives in
  // core registers, in which case the integer path below is used.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // For f64 the mask is shifted left by 32 within the 64-bit lane.
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    // Bring the sign operand to the same width as the result.
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // MaskNot = Mask XOR all-ones, i.e. every bit except the masked ones.
    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Result = (sign operand & Mask) | (magnitude operand & ~Mask).
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the second VMOVRRD result is needed from an f64 sign operand.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}

SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
5602 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5603 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5604 } 5605 5606 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5607 const ARMBaseRegisterInfo &ARI = 5608 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5609 MachineFunction &MF = DAG.getMachineFunction(); 5610 MachineFrameInfo &MFI = MF.getFrameInfo(); 5611 MFI.setFrameAddressIsTaken(true); 5612 5613 EVT VT = Op.getValueType(); 5614 SDLoc dl(Op); // FIXME probably not meaningful 5615 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5616 Register FrameReg = ARI.getFrameRegister(MF); 5617 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5618 while (Depth--) 5619 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5620 MachinePointerInfo()); 5621 return FrameAddr; 5622 } 5623 5624 // FIXME? Maybe this could be a TableGen attribute on some registers and 5625 // this table could be generated automatically from RegInfo. 5626 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 5627 const MachineFunction &MF) const { 5628 Register Reg = StringSwitch<unsigned>(RegName) 5629 .Case("sp", ARM::SP) 5630 .Default(0); 5631 if (Reg) 5632 return Reg; 5633 report_fatal_error(Twine("Invalid register name \"" 5634 + StringRef(RegName) + "\".")); 5635 } 5636 5637 // Result is 64 bit value so split into two 32 bit values and return as a 5638 // pair of values. 5639 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5640 SelectionDAG &DAG) { 5641 SDLoc DL(N); 5642 5643 // This function is only supposed to be called for i64 type destination. 
5644 assert(N->getValueType(0) == MVT::i64 5645 && "ExpandREAD_REGISTER called for non-i64 type result."); 5646 5647 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5648 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5649 N->getOperand(0), 5650 N->getOperand(1)); 5651 5652 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5653 Read.getValue(1))); 5654 Results.push_back(Read.getOperand(0)); 5655 } 5656 5657 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 5658 /// When \p DstVT, the destination type of \p BC, is on the vector 5659 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5660 /// it might be possible to combine them, such that everything stays on the 5661 /// vector register bank. 5662 /// \p return The node that would replace \p BT, if the combine 5663 /// is possible. 5664 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5665 SelectionDAG &DAG) { 5666 SDValue Op = BC->getOperand(0); 5667 EVT DstVT = BC->getValueType(0); 5668 5669 // The only vector instruction that can produce a scalar (remember, 5670 // since the bitcast was about to be turned into VMOVDRR, the source 5671 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5672 // Moreover, we can do this combine only if there is one use. 5673 // Finally, if the destination type is not a vector, there is not 5674 // much point on forcing everything on the vector bank. 5675 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5676 !Op.hasOneUse()) 5677 return SDValue(); 5678 5679 // If the index is not constant, we will introduce an additional 5680 // multiply that will stick. 5681 // Give up in that case. 5682 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5683 if (!Index) 5684 return SDValue(); 5685 unsigned DstNumElt = DstVT.getVectorNumElements(); 5686 5687 // Compute the new index. 
5688 const APInt &APIntIndex = Index->getAPIntValue(); 5689 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5690 NewIndex *= APIntIndex; 5691 // Check if the new constant index fits into i32. 5692 if (NewIndex.getBitWidth() > 32) 5693 return SDValue(); 5694 5695 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5696 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5697 SDLoc dl(Op); 5698 SDValue ExtractSrc = Op.getOperand(0); 5699 EVT VecVT = EVT::getVectorVT( 5700 *DAG.getContext(), DstVT.getScalarType(), 5701 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5702 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5703 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5704 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5705 } 5706 5707 /// ExpandBITCAST - If the target supports VFP, this function is called to 5708 /// expand a bit convert where either the source or destination type is i64 to 5709 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5710 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5711 /// vectors), since the legalizer won't know what to do with that. 5712 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5713 const ARMSubtarget *Subtarget) { 5714 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5715 SDLoc dl(N); 5716 SDValue Op = N->getOperand(0); 5717 5718 // This function is only supposed to be called for i64 types, either as the 5719 // source or destination of the bit convert. 
5720 EVT SrcVT = Op.getValueType(); 5721 EVT DstVT = N->getValueType(0); 5722 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5723 5724 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5725 // FullFP16: half values are passed in S-registers, and we don't 5726 // need any of the bitcast and moves: 5727 // 5728 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5729 // t5: i32 = bitcast t2 5730 // t18: f16 = ARMISD::VMOVhr t5 5731 if (Op.getOpcode() != ISD::CopyFromReg || 5732 Op.getValueType() != MVT::f32) 5733 return SDValue(); 5734 5735 auto Move = N->use_begin(); 5736 if (Move->getOpcode() != ARMISD::VMOVhr) 5737 return SDValue(); 5738 5739 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5740 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5741 DAG.ReplaceAllUsesWith(*Move, &Copy); 5742 return Copy; 5743 } 5744 5745 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5746 if (!HasFullFP16) 5747 return SDValue(); 5748 // SoftFP: read half-precision arguments: 5749 // 5750 // t2: i32,ch = ... 
5751 // t7: i16 = truncate t2 <~~~~ Op 5752 // t8: f16 = bitcast t7 <~~~~ N 5753 // 5754 if (Op.getOperand(0).getValueType() == MVT::i32) 5755 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5756 MVT::f16, Op.getOperand(0)); 5757 5758 return SDValue(); 5759 } 5760 5761 // Half-precision return values 5762 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5763 if (!HasFullFP16) 5764 return SDValue(); 5765 // 5766 // t11: f16 = fadd t8, t10 5767 // t12: i16 = bitcast t11 <~~~ SDNode N 5768 // t13: i32 = zero_extend t12 5769 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5770 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5771 // 5772 // transform this into: 5773 // 5774 // t20: i32 = ARMISD::VMOVrh t11 5775 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5776 // 5777 auto ZeroExtend = N->use_begin(); 5778 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5779 ZeroExtend->getValueType(0) != MVT::i32) 5780 return SDValue(); 5781 5782 auto Copy = ZeroExtend->use_begin(); 5783 if (Copy->getOpcode() == ISD::CopyToReg && 5784 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5785 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5786 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5787 return Cvt; 5788 } 5789 return SDValue(); 5790 } 5791 5792 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5793 return SDValue(); 5794 5795 // Turn i64->f64 into VMOVDRR. 5796 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5797 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5798 // if we can combine the bitcast with its source. 
5799 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5800 return Val; 5801 5802 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5803 DAG.getConstant(0, dl, MVT::i32)); 5804 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5805 DAG.getConstant(1, dl, MVT::i32)); 5806 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5807 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5808 } 5809 5810 // Turn f64->i64 into VMOVRRD. 5811 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5812 SDValue Cvt; 5813 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5814 SrcVT.getVectorNumElements() > 1) 5815 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5816 DAG.getVTList(MVT::i32, MVT::i32), 5817 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5818 else 5819 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5820 DAG.getVTList(MVT::i32, MVT::i32), Op); 5821 // Merge the pieces into a single i64 value. 5822 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5823 } 5824 5825 return SDValue(); 5826 } 5827 5828 /// getZeroVector - Returns a vector of specified type with all zero elements. 5829 /// Zero vectors are used to represent vector negation and in those cases 5830 /// will be implemented with the NEON VNEG instruction. However, VNEG does 5831 /// not support i64 elements, so sometimes the zero vectors will need to be 5832 /// explicitly constructed. Regardless, use a canonical VMOV to create the 5833 /// zero vector. 5834 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5835 assert(VT.isVector() && "Expected a vector type"); 5836 // The canonical modified immediate encoding of a zero vector is....0! 5837 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5838 EVT VmovVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; 5839 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 5840 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5841 } 5842 5843 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5844 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5845 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 5846 SelectionDAG &DAG) const { 5847 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5848 EVT VT = Op.getValueType(); 5849 unsigned VTBits = VT.getSizeInBits(); 5850 SDLoc dl(Op); 5851 SDValue ShOpLo = Op.getOperand(0); 5852 SDValue ShOpHi = Op.getOperand(1); 5853 SDValue ShAmt = Op.getOperand(2); 5854 SDValue ARMcc; 5855 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5856 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5857 5858 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5859 5860 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5861 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5862 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5863 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5864 DAG.getConstant(VTBits, dl, MVT::i32)); 5865 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5866 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5867 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5868 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5869 ISD::SETGE, ARMcc, DAG, dl); 5870 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 5871 ARMcc, CCR, CmpLo); 5872 5873 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5874 SDValue HiBigShift = Opc == ISD::SRA 5875 ? 
DAG.getNode(Opc, dl, VT, ShOpHi, 5876 DAG.getConstant(VTBits - 1, dl, VT)) 5877 : DAG.getConstant(0, dl, VT); 5878 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5879 ISD::SETGE, ARMcc, DAG, dl); 5880 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5881 ARMcc, CCR, CmpHi); 5882 5883 SDValue Ops[2] = { Lo, Hi }; 5884 return DAG.getMergeValues(Ops, dl); 5885 } 5886 5887 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5888 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5889 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 5890 SelectionDAG &DAG) const { 5891 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5892 EVT VT = Op.getValueType(); 5893 unsigned VTBits = VT.getSizeInBits(); 5894 SDLoc dl(Op); 5895 SDValue ShOpLo = Op.getOperand(0); 5896 SDValue ShOpHi = Op.getOperand(1); 5897 SDValue ShAmt = Op.getOperand(2); 5898 SDValue ARMcc; 5899 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5900 5901 assert(Op.getOpcode() == ISD::SHL_PARTS); 5902 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5903 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5904 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5905 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 5906 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5907 5908 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5909 DAG.getConstant(VTBits, dl, MVT::i32)); 5910 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 5911 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5912 ISD::SETGE, ARMcc, DAG, dl); 5913 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5914 ARMcc, CCR, CmpHi); 5915 5916 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5917 ISD::SETGE, ARMcc, DAG, dl); 5918 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5919 SDValue Lo = 
DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 5920 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 5921 5922 SDValue Ops[2] = { Lo, Hi }; 5923 return DAG.getMergeValues(Ops, dl); 5924 } 5925 5926 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 5927 SelectionDAG &DAG) const { 5928 // The rounding mode is in bits 23:22 of the FPSCR. 5929 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 5930 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 5931 // so that the shift + and get folded into a bitfield extract. 5932 SDLoc dl(Op); 5933 SDValue Ops[] = { DAG.getEntryNode(), 5934 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5935 5936 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5937 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5938 DAG.getConstant(1U << 22, dl, MVT::i32)); 5939 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5940 DAG.getConstant(22, dl, MVT::i32)); 5941 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5942 DAG.getConstant(3, dl, MVT::i32)); 5943 } 5944 5945 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5946 const ARMSubtarget *ST) { 5947 SDLoc dl(N); 5948 EVT VT = N->getValueType(0); 5949 if (VT.isVector() && ST->hasNEON()) { 5950 5951 // Compute the least significant set bit: LSB = X & -X 5952 SDValue X = N->getOperand(0); 5953 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5954 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5955 5956 EVT ElemTy = VT.getVectorElementType(); 5957 5958 if (ElemTy == MVT::i8) { 5959 // Compute with: cttz(x) = ctpop(lsb - 1) 5960 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5961 DAG.getTargetConstant(1, dl, ElemTy)); 5962 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5963 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5964 } 5965 5966 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5967 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 
5968 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5969 unsigned NumBits = ElemTy.getSizeInBits(); 5970 SDValue WidthMinus1 = 5971 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5972 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5973 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5974 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5975 } 5976 5977 // Compute with: cttz(x) = ctpop(lsb - 1) 5978 5979 // Compute LSB - 1. 5980 SDValue Bits; 5981 if (ElemTy == MVT::i64) { 5982 // Load constant 0xffff'ffff'ffff'ffff to register. 5983 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5984 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5985 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5986 } else { 5987 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5988 DAG.getTargetConstant(1, dl, ElemTy)); 5989 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5990 } 5991 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5992 } 5993 5994 if (!ST->hasV6T2Ops()) 5995 return SDValue(); 5996 5997 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5998 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 5999 } 6000 6001 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6002 const ARMSubtarget *ST) { 6003 EVT VT = N->getValueType(0); 6004 SDLoc DL(N); 6005 6006 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6007 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6008 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6009 "Unexpected type for custom ctpop lowering"); 6010 6011 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6012 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6013 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6014 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6015 6016 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6017 unsigned EltSize = 8; 6018 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 6019 while (EltSize != VT.getScalarSizeInBits()) { 6020 SmallVector<SDValue, 8> Ops; 6021 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6022 TLI.getPointerTy(DAG.getDataLayout()))); 6023 Ops.push_back(Res); 6024 6025 EltSize *= 2; 6026 NumElts /= 2; 6027 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6028 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6029 } 6030 6031 return Res; 6032 } 6033 6034 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 6035 /// operand of a vector shift operation, where all the elements of the 6036 /// build_vector must have the same constant integer value. 6037 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6038 // Ignore bit_converts. 6039 while (Op.getOpcode() == ISD::BITCAST) 6040 Op = Op.getOperand(0); 6041 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6042 APInt SplatBits, SplatUndef; 6043 unsigned SplatBitSize; 6044 bool HasAnyUndefs; 6045 if (!BVN || 6046 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6047 ElementBits) || 6048 SplatBitSize > ElementBits) 6049 return false; 6050 Cnt = SplatBits.getSExtValue(); 6051 return true; 6052 } 6053 6054 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6055 /// operand of a vector shift left operation. That value must be in the range: 6056 /// 0 <= Value < ElementBits for a left shift; or 6057 /// 0 <= Value <= ElementBits for a long left shift. 6058 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6059 assert(VT.isVector() && "vector shift count is not a vector type"); 6060 int64_t ElementBits = VT.getScalarSizeInBits(); 6061 if (!getVShiftImm(Op, ElementBits, Cnt)) 6062 return false; 6063 return (Cnt >= 0 && (isLong ? 
Cnt - 1 : Cnt) < ElementBits); 6064 } 6065 6066 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6067 /// operand of a vector shift right operation. For a shift opcode, the value 6068 /// is positive, but for an intrinsic the value count must be negative. The 6069 /// absolute value must be in the range: 6070 /// 1 <= |Value| <= ElementBits for a right shift; or 6071 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6072 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6073 int64_t &Cnt) { 6074 assert(VT.isVector() && "vector shift count is not a vector type"); 6075 int64_t ElementBits = VT.getScalarSizeInBits(); 6076 if (!getVShiftImm(Op, ElementBits, Cnt)) 6077 return false; 6078 if (!isIntrinsic) 6079 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6080 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6081 Cnt = -Cnt; 6082 return true; 6083 } 6084 return false; 6085 } 6086 6087 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6088 const ARMSubtarget *ST) { 6089 EVT VT = N->getValueType(0); 6090 SDLoc dl(N); 6091 int64_t Cnt; 6092 6093 if (!VT.isVector()) 6094 return SDValue(); 6095 6096 // We essentially have two forms here. Shift by an immediate and shift by a 6097 // vector register (there are also shift by a gpr, but that is just handled 6098 // with a tablegen pattern). We cannot easily match shift by an immediate in 6099 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6100 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6101 // signed or unsigned, and a negative shift indicates a shift right). 
6102 if (N->getOpcode() == ISD::SHL) { 6103 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6104 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6105 DAG.getConstant(Cnt, dl, MVT::i32)); 6106 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6107 N->getOperand(1)); 6108 } 6109 6110 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6111 "unexpected vector shift opcode"); 6112 6113 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6114 unsigned VShiftOpc = 6115 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6116 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6117 DAG.getConstant(Cnt, dl, MVT::i32)); 6118 } 6119 6120 // Other right shifts we don't have operations for (we use a shift left by a 6121 // negative number). 6122 EVT ShiftVT = N->getOperand(1).getValueType(); 6123 SDValue NegatedCount = DAG.getNode( 6124 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6125 unsigned VShiftOpc = 6126 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHLs : ARMISD::VSHLu); 6127 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6128 } 6129 6130 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6131 const ARMSubtarget *ST) { 6132 EVT VT = N->getValueType(0); 6133 SDLoc dl(N); 6134 6135 // We can get here for a node like i32 = ISD::SHL i32, i64 6136 if (VT != MVT::i64) 6137 return SDValue(); 6138 6139 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6140 N->getOpcode() == ISD::SHL) && 6141 "Unknown shift to lower!"); 6142 6143 unsigned ShOpc = N->getOpcode(); 6144 if (ST->hasMVEIntegerOps()) { 6145 SDValue ShAmt = N->getOperand(1); 6146 unsigned ShPartsOpc = ARMISD::LSLL; 6147 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6148 6149 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6150 // then do the default optimisation 6151 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6152 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6153 return SDValue(); 6154 6155 // Extract the lower 32 bits of the shift amount if it's not an i32 6156 if (ShAmt->getValueType(0) != MVT::i32) 6157 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6158 6159 if (ShOpc == ISD::SRL) { 6160 if (!Con) 6161 // There is no t2LSRLr instruction so negate and perform an lsll if the 6162 // shift amount is in a register, emulating a right shift. 
6163 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6164 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6165 else 6166 // Else generate an lsrl on the immediate shift amount 6167 ShPartsOpc = ARMISD::LSRL; 6168 } else if (ShOpc == ISD::SRA) 6169 ShPartsOpc = ARMISD::ASRL; 6170 6171 // Lower 32 bits of the destination/source 6172 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6173 DAG.getConstant(0, dl, MVT::i32)); 6174 // Upper 32 bits of the destination/source 6175 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6176 DAG.getConstant(1, dl, MVT::i32)); 6177 6178 // Generate the shift operation as computed above 6179 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6180 ShAmt); 6181 // The upper 32 bits come from the second return value of lsll 6182 Hi = SDValue(Lo.getNode(), 1); 6183 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6184 } 6185 6186 // We only lower SRA, SRL of 1 here, all others use generic lowering. 6187 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6188 return SDValue(); 6189 6190 // If we are in thumb mode, we don't have RRX. 6191 if (ST->isThumb1Only()) 6192 return SDValue(); 6193 6194 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6195 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6196 DAG.getConstant(0, dl, MVT::i32)); 6197 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6198 DAG.getConstant(1, dl, MVT::i32)); 6199 6200 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6201 // captures the result into a carry flag. 6202 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6203 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6204 6205 // The low part is an ARMISD::RRX operand, which shifts the carry in. 
6206 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6207 6208 // Merge the pieces into a single i64 value. 6209 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6210 } 6211 6212 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6213 const ARMSubtarget *ST) { 6214 bool Invert = false; 6215 bool Swap = false; 6216 unsigned Opc = ARMCC::AL; 6217 6218 SDValue Op0 = Op.getOperand(0); 6219 SDValue Op1 = Op.getOperand(1); 6220 SDValue CC = Op.getOperand(2); 6221 EVT VT = Op.getValueType(); 6222 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6223 SDLoc dl(Op); 6224 6225 EVT CmpVT; 6226 if (ST->hasNEON()) 6227 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6228 else { 6229 assert(ST->hasMVEIntegerOps() && 6230 "No hardware support for integer vector comparison!"); 6231 6232 if (Op.getValueType().getVectorElementType() != MVT::i1) 6233 return SDValue(); 6234 6235 // Make sure we expand floating point setcc to scalar if we do not have 6236 // mve.fp, so that we can handle them from there. 6237 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6238 return SDValue(); 6239 6240 CmpVT = VT; 6241 } 6242 6243 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6244 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6245 // Special-case integer 64-bit equality comparisons. They aren't legal, 6246 // but they can be lowered with a few vector instructions. 
6247 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6248 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6249 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6250 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6251 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6252 DAG.getCondCode(ISD::SETEQ)); 6253 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6254 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6255 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6256 if (SetCCOpcode == ISD::SETNE) 6257 Merged = DAG.getNOT(dl, Merged, CmpVT); 6258 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6259 return Merged; 6260 } 6261 6262 if (CmpVT.getVectorElementType() == MVT::i64) 6263 // 64-bit comparisons are not legal in general. 6264 return SDValue(); 6265 6266 if (Op1.getValueType().isFloatingPoint()) { 6267 switch (SetCCOpcode) { 6268 default: llvm_unreachable("Illegal FP comparison"); 6269 case ISD::SETUNE: 6270 case ISD::SETNE: 6271 if (ST->hasMVEFloatOps()) { 6272 Opc = ARMCC::NE; break; 6273 } else { 6274 Invert = true; LLVM_FALLTHROUGH; 6275 } 6276 case ISD::SETOEQ: 6277 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6278 case ISD::SETOLT: 6279 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6280 case ISD::SETOGT: 6281 case ISD::SETGT: Opc = ARMCC::GT; break; 6282 case ISD::SETOLE: 6283 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6284 case ISD::SETOGE: 6285 case ISD::SETGE: Opc = ARMCC::GE; break; 6286 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6287 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6288 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6289 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6290 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6291 case ISD::SETONE: { 6292 // Expand this to (OLT | OGT). 
6293 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6294 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6295 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6296 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6297 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6298 if (Invert) 6299 Result = DAG.getNOT(dl, Result, VT); 6300 return Result; 6301 } 6302 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6303 case ISD::SETO: { 6304 // Expand this to (OLT | OGE). 6305 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6306 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6307 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6308 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6309 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6310 if (Invert) 6311 Result = DAG.getNOT(dl, Result, VT); 6312 return Result; 6313 } 6314 } 6315 } else { 6316 // Integer comparisons. 6317 switch (SetCCOpcode) { 6318 default: llvm_unreachable("Illegal integer comparison"); 6319 case ISD::SETNE: 6320 if (ST->hasMVEIntegerOps()) { 6321 Opc = ARMCC::NE; break; 6322 } else { 6323 Invert = true; LLVM_FALLTHROUGH; 6324 } 6325 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6326 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6327 case ISD::SETGT: Opc = ARMCC::GT; break; 6328 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6329 case ISD::SETGE: Opc = ARMCC::GE; break; 6330 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6331 case ISD::SETUGT: Opc = ARMCC::HI; break; 6332 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6333 case ISD::SETUGE: Opc = ARMCC::HS; break; 6334 } 6335 6336 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6337 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6338 SDValue AndOp; 6339 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6340 AndOp = Op0; 6341 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6342 AndOp = Op1; 6343 6344 // Ignore bitconvert. 
6345 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6346 AndOp = AndOp.getOperand(0); 6347 6348 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6349 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6350 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6351 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6352 if (!Invert) 6353 Result = DAG.getNOT(dl, Result, VT); 6354 return Result; 6355 } 6356 } 6357 } 6358 6359 if (Swap) 6360 std::swap(Op0, Op1); 6361 6362 // If one of the operands is a constant vector zero, attempt to fold the 6363 // comparison to a specialized compare-against-zero form. 6364 SDValue SingleOp; 6365 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6366 SingleOp = Op0; 6367 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6368 if (Opc == ARMCC::GE) 6369 Opc = ARMCC::LE; 6370 else if (Opc == ARMCC::GT) 6371 Opc = ARMCC::LT; 6372 SingleOp = Op1; 6373 } 6374 6375 SDValue Result; 6376 if (SingleOp.getNode()) { 6377 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6378 DAG.getConstant(Opc, dl, MVT::i32)); 6379 } else { 6380 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6381 DAG.getConstant(Opc, dl, MVT::i32)); 6382 } 6383 6384 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6385 6386 if (Invert) 6387 Result = DAG.getNOT(dl, Result, VT); 6388 6389 return Result; 6390 } 6391 6392 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6393 SDValue LHS = Op.getOperand(0); 6394 SDValue RHS = Op.getOperand(1); 6395 SDValue Carry = Op.getOperand(2); 6396 SDValue Cond = Op.getOperand(3); 6397 SDLoc DL(Op); 6398 6399 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6400 6401 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6402 // have to invert the carry first. 
6403 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6404 DAG.getConstant(1, DL, MVT::i32), Carry); 6405 // This converts the boolean value carry into the carry flag. 6406 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6407 6408 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6409 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6410 6411 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6412 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6413 SDValue ARMcc = DAG.getConstant( 6414 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6415 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6416 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6417 Cmp.getValue(1), SDValue()); 6418 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6419 CCR, Chain.getValue(1)); 6420 } 6421 6422 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6423 /// valid vector constant for a NEON or MVE instruction with a "modified 6424 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6425 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6426 unsigned SplatBitSize, SelectionDAG &DAG, 6427 const SDLoc &dl, EVT &VT, bool is128Bits, 6428 VMOVModImmType type) { 6429 unsigned OpCmode, Imm; 6430 6431 // SplatBitSize is set to the smallest size that splats the vector, so a 6432 // zero vector will always have SplatBitSize == 8. However, NEON modified 6433 // immediate instructions others than VMOV do not support the 8-bit encoding 6434 // of a zero vector, and the default encoding of zero is supposed to be the 6435 // 32-bit version. 6436 if (SplatBits == 0) 6437 SplatBitSize = 32; 6438 6439 switch (SplatBitSize) { 6440 case 8: 6441 if (type != VMOVModImm) 6442 return SDValue(); 6443 // Any 1-byte value is OK. Op=0, Cmode=1110. 
6444 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6445 OpCmode = 0xe; 6446 Imm = SplatBits; 6447 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6448 break; 6449 6450 case 16: 6451 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6452 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6453 if ((SplatBits & ~0xff) == 0) { 6454 // Value = 0x00nn: Op=x, Cmode=100x. 6455 OpCmode = 0x8; 6456 Imm = SplatBits; 6457 break; 6458 } 6459 if ((SplatBits & ~0xff00) == 0) { 6460 // Value = 0xnn00: Op=x, Cmode=101x. 6461 OpCmode = 0xa; 6462 Imm = SplatBits >> 8; 6463 break; 6464 } 6465 return SDValue(); 6466 6467 case 32: 6468 // NEON's 32-bit VMOV supports splat values where: 6469 // * only one byte is nonzero, or 6470 // * the least significant byte is 0xff and the second byte is nonzero, or 6471 // * the least significant 2 bytes are 0xff and the third is nonzero. 6472 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6473 if ((SplatBits & ~0xff) == 0) { 6474 // Value = 0x000000nn: Op=x, Cmode=000x. 6475 OpCmode = 0; 6476 Imm = SplatBits; 6477 break; 6478 } 6479 if ((SplatBits & ~0xff00) == 0) { 6480 // Value = 0x0000nn00: Op=x, Cmode=001x. 6481 OpCmode = 0x2; 6482 Imm = SplatBits >> 8; 6483 break; 6484 } 6485 if ((SplatBits & ~0xff0000) == 0) { 6486 // Value = 0x00nn0000: Op=x, Cmode=010x. 6487 OpCmode = 0x4; 6488 Imm = SplatBits >> 16; 6489 break; 6490 } 6491 if ((SplatBits & ~0xff000000) == 0) { 6492 // Value = 0xnn000000: Op=x, Cmode=011x. 6493 OpCmode = 0x6; 6494 Imm = SplatBits >> 24; 6495 break; 6496 } 6497 6498 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6499 if (type == OtherModImm) return SDValue(); 6500 6501 if ((SplatBits & ~0xffff) == 0 && 6502 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6503 // Value = 0x0000nnff: Op=x, Cmode=1100. 
6504 OpCmode = 0xc; 6505 Imm = SplatBits >> 8; 6506 break; 6507 } 6508 6509 // cmode == 0b1101 is not supported for MVE VMVN 6510 if (type == MVEVMVNModImm) 6511 return SDValue(); 6512 6513 if ((SplatBits & ~0xffffff) == 0 && 6514 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6515 // Value = 0x00nnffff: Op=x, Cmode=1101. 6516 OpCmode = 0xd; 6517 Imm = SplatBits >> 16; 6518 break; 6519 } 6520 6521 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6522 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6523 // VMOV.I32. A (very) minor optimization would be to replicate the value 6524 // and fall through here to test for a valid 64-bit splat. But, then the 6525 // caller would also need to check and handle the change in size. 6526 return SDValue(); 6527 6528 case 64: { 6529 if (type != VMOVModImm) 6530 return SDValue(); 6531 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6532 uint64_t BitMask = 0xff; 6533 uint64_t Val = 0; 6534 unsigned ImmMask = 1; 6535 Imm = 0; 6536 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6537 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6538 Val |= BitMask; 6539 Imm |= ImmMask; 6540 } else if ((SplatBits & BitMask) != 0) { 6541 return SDValue(); 6542 } 6543 BitMask <<= 8; 6544 ImmMask <<= 1; 6545 } 6546 6547 if (DAG.getDataLayout().isBigEndian()) 6548 // swap higher and lower 32 bit word 6549 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 6550 6551 // Op=1, Cmode=1110. 6552 OpCmode = 0x1e; 6553 VT = is128Bits ? 
/// Custom lowering for a scalar floating-point constant.
///
/// Returns \p Op unchanged when the constant can already be selected as an
/// immediate, a replacement node when one of the strategies below applies, and
/// a null SDValue to request the default lowering otherwise.
///
/// Strategies, in the order they are tried:
///  1. Execute-only code: legal FP immediates are kept; anything else is
///     built from integer constants moved into an FP register (VMOVDRR for
///     f64, VMOVSR for f32).
///  2. A VFP floating-point immediate (requires VFP3; for f32 with NEON
///     preferred for single precision this becomes a VMOVFPIMM splat plus an
///     element extract).
///  3. A NEON VMOV.i32 modified immediate, bitcast/extracted to the FP type.
///  4. A NEON VMVN.i32 modified immediate of the inverted bit pattern.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  EVT VT = Op.getValueType();
  bool IsDouble = (VT == MVT::f64);
  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  const APFloat &FPVal = CFP->getValueAPF();

  // Prevent floating-point constants from using literal loads
  // when execute-only is enabled.
  if (ST->genExecuteOnly()) {
    // If we can represent the constant as an immediate, don't lower it
    if (isFPImmLegal(FPVal, VT))
      return Op;
    // Otherwise, construct as integer, and move to float register
    APInt INTVal = FPVal.bitcastToAPInt();
    SDLoc DL(CFP);
    switch (VT.getSimpleVT().SimpleTy) {
      default:
        llvm_unreachable("Unknown floating point type!");
        break;
      case MVT::f64: {
        // Split the 64-bit pattern into two 32-bit halves; the operand order
        // of VMOVDRR is exchanged on big-endian subtargets.
        SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
        SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
        if (!ST->isLittle())
          std::swap(Lo, Hi);
        return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
      }
      case MVT::f32:
          return DAG.getNode(ARMISD::VMOVSR, DL, VT,
              DAG.getConstant(INTVal, DL, MVT::i32));
    }
  }

  // Everything below relies on VFP3 immediates or NEON.
  if (!ST->hasVFP3Base())
    return SDValue();

  // Use the default (constant pool) lowering for double constants when we have
  // an SP-only FPU
  // NOTE(review): this check reads the member `Subtarget` while the rest of
  // the function uses the `ST` parameter - presumably the same object; confirm.
  if (IsDouble && !Subtarget->hasFP64())
    return SDValue();

  // Try splatting with a VMOV.f32...
  // getFP64Imm/getFP32Imm yield -1 when the value has no immediate encoding.
  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);

  if (ImmVal != -1) {
    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
      // We have code in place to select a valid ConstantFP already, no need to
      // do any mangling.
      return Op;
    }

    // It's a float and we are trying to use NEON operations where
    // possible. Lower it to a splat followed by an extract.
    SDLoc DL(Op);
    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // The rest of our options are NEON only, make sure that's allowed before
  // proceeding..
  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
    return SDValue();

  EVT VMovVT;
  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();

  // It wouldn't really be worth bothering for doubles except for one very
  // important value, which does happen to match: 0.0. So make sure we don't do
  // anything stupid.
  // (A double is only handled when both 32-bit halves are identical, since
  // the value is materialized as a 32-bit element splat below.)
  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
    return SDValue();

  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32
  // (the inverted low 32 bits are offered to the immediate encoder).
  NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // No cheap materialization found; use the default lowering.
  return SDValue();
}
6651 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6652 VecConstant); 6653 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6654 DAG.getConstant(0, DL, MVT::i32)); 6655 } 6656 6657 // Finally, try a VMVN.i32 6658 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 6659 false, VMVNModImm); 6660 if (NewVal != SDValue()) { 6661 SDLoc DL(Op); 6662 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 6663 6664 if (IsDouble) 6665 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6666 6667 // It's a float: cast and extract a vector element. 6668 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6669 VecConstant); 6670 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6671 DAG.getConstant(0, DL, MVT::i32)); 6672 } 6673 6674 return SDValue(); 6675 } 6676 6677 // check if an VEXT instruction can handle the shuffle mask when the 6678 // vector sources of the shuffle are the same. 6679 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6680 unsigned NumElts = VT.getVectorNumElements(); 6681 6682 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6683 if (M[0] < 0) 6684 return false; 6685 6686 Imm = M[0]; 6687 6688 // If this is a VEXT shuffle, the immediate value is the index of the first 6689 // element. The other shuffle indices must be the successive elements after 6690 // the first one. 6691 unsigned ExpectedElt = Imm; 6692 for (unsigned i = 1; i < NumElts; ++i) { 6693 // Increment the expected index. If it wraps around, just follow it 6694 // back to index zero and keep going. 
6695 ++ExpectedElt; 6696 if (ExpectedElt == NumElts) 6697 ExpectedElt = 0; 6698 6699 if (M[i] < 0) continue; // ignore UNDEF indices 6700 if (ExpectedElt != static_cast<unsigned>(M[i])) 6701 return false; 6702 } 6703 6704 return true; 6705 } 6706 6707 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6708 bool &ReverseVEXT, unsigned &Imm) { 6709 unsigned NumElts = VT.getVectorNumElements(); 6710 ReverseVEXT = false; 6711 6712 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6713 if (M[0] < 0) 6714 return false; 6715 6716 Imm = M[0]; 6717 6718 // If this is a VEXT shuffle, the immediate value is the index of the first 6719 // element. The other shuffle indices must be the successive elements after 6720 // the first one. 6721 unsigned ExpectedElt = Imm; 6722 for (unsigned i = 1; i < NumElts; ++i) { 6723 // Increment the expected index. If it wraps around, it may still be 6724 // a VEXT but the source vectors must be swapped. 6725 ExpectedElt += 1; 6726 if (ExpectedElt == NumElts * 2) { 6727 ExpectedElt = 0; 6728 ReverseVEXT = true; 6729 } 6730 6731 if (M[i] < 0) continue; // ignore UNDEF indices 6732 if (ExpectedElt != static_cast<unsigned>(M[i])) 6733 return false; 6734 } 6735 6736 // Adjust the index value if the source operands will be swapped. 6737 if (ReverseVEXT) 6738 Imm -= NumElts; 6739 6740 return true; 6741 } 6742 6743 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 6744 /// instruction with the specified blocksize. (The order of the elements 6745 /// within each block of the vector is reversed.) 
6746 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6747 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6748 "Only possible block sizes for VREV are: 16, 32, 64"); 6749 6750 unsigned EltSz = VT.getScalarSizeInBits(); 6751 if (EltSz == 64) 6752 return false; 6753 6754 unsigned NumElts = VT.getVectorNumElements(); 6755 unsigned BlockElts = M[0] + 1; 6756 // If the first shuffle index is UNDEF, be optimistic. 6757 if (M[0] < 0) 6758 BlockElts = BlockSize / EltSz; 6759 6760 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6761 return false; 6762 6763 for (unsigned i = 0; i < NumElts; ++i) { 6764 if (M[i] < 0) continue; // ignore UNDEF indices 6765 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6766 return false; 6767 } 6768 6769 return true; 6770 } 6771 6772 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6773 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6774 // range, then 0 is placed into the resulting vector. So pretty much any mask 6775 // of 8 elements can work here. 6776 return VT == MVT::v8i8 && M.size() == 8; 6777 } 6778 6779 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6780 unsigned Index) { 6781 if (Mask.size() == Elements * 2) 6782 return Index / Elements; 6783 return Mask[Index] == 0 ? 0 : 1; 6784 } 6785 6786 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6787 // checking that pairs of elements in the shuffle mask represent the same index 6788 // in each vector, incrementing the expected index by 2 at each step. 6789 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6790 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6791 // v2={e,f,g,h} 6792 // WhichResult gives the offset for each element in the mask based on which 6793 // of the two results it belongs to. 
6794 // 6795 // The transpose can be represented either as: 6796 // result1 = shufflevector v1, v2, result1_shuffle_mask 6797 // result2 = shufflevector v1, v2, result2_shuffle_mask 6798 // where v1/v2 and the shuffle masks have the same number of elements 6799 // (here WhichResult (see below) indicates which result is being checked) 6800 // 6801 // or as: 6802 // results = shufflevector v1, v2, shuffle_mask 6803 // where both results are returned in one vector and the shuffle mask has twice 6804 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6805 // want to check the low half and high half of the shuffle mask as if it were 6806 // the other case 6807 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6808 unsigned EltSz = VT.getScalarSizeInBits(); 6809 if (EltSz == 64) 6810 return false; 6811 6812 unsigned NumElts = VT.getVectorNumElements(); 6813 if (M.size() != NumElts && M.size() != NumElts*2) 6814 return false; 6815 6816 // If the mask is twice as long as the input vector then we need to check the 6817 // upper and lower parts of the mask with a matching value for WhichResult 6818 // FIXME: A mask with only even values will be rejected in case the first 6819 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 6820 // M[0] is used to determine WhichResult 6821 for (unsigned i = 0; i < M.size(); i += NumElts) { 6822 WhichResult = SelectPairHalf(NumElts, M, i); 6823 for (unsigned j = 0; j < NumElts; j += 2) { 6824 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6825 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 6826 return false; 6827 } 6828 } 6829 6830 if (M.size() == NumElts*2) 6831 WhichResult = 0; 6832 6833 return true; 6834 } 6835 6836 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 6837 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6838 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
6839 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6840 unsigned EltSz = VT.getScalarSizeInBits(); 6841 if (EltSz == 64) 6842 return false; 6843 6844 unsigned NumElts = VT.getVectorNumElements(); 6845 if (M.size() != NumElts && M.size() != NumElts*2) 6846 return false; 6847 6848 for (unsigned i = 0; i < M.size(); i += NumElts) { 6849 WhichResult = SelectPairHalf(NumElts, M, i); 6850 for (unsigned j = 0; j < NumElts; j += 2) { 6851 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6852 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 6853 return false; 6854 } 6855 } 6856 6857 if (M.size() == NumElts*2) 6858 WhichResult = 0; 6859 6860 return true; 6861 } 6862 6863 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 6864 // that the mask elements are either all even and in steps of size 2 or all odd 6865 // and in steps of size 2. 6866 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 6867 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 6868 // v2={e,f,g,h} 6869 // Requires similar checks to that of isVTRNMask with 6870 // respect the how results are returned. 6871 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6872 unsigned EltSz = VT.getScalarSizeInBits(); 6873 if (EltSz == 64) 6874 return false; 6875 6876 unsigned NumElts = VT.getVectorNumElements(); 6877 if (M.size() != NumElts && M.size() != NumElts*2) 6878 return false; 6879 6880 for (unsigned i = 0; i < M.size(); i += NumElts) { 6881 WhichResult = SelectPairHalf(NumElts, M, i); 6882 for (unsigned j = 0; j < NumElts; ++j) { 6883 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6884 return false; 6885 } 6886 } 6887 6888 if (M.size() == NumElts*2) 6889 WhichResult = 0; 6890 6891 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6892 if (VT.is64BitVector() && EltSz == 32) 6893 return false; 6894 6895 return true; 6896 } 6897 6898 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6899 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6900 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6901 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6902 unsigned EltSz = VT.getScalarSizeInBits(); 6903 if (EltSz == 64) 6904 return false; 6905 6906 unsigned NumElts = VT.getVectorNumElements(); 6907 if (M.size() != NumElts && M.size() != NumElts*2) 6908 return false; 6909 6910 unsigned Half = NumElts / 2; 6911 for (unsigned i = 0; i < M.size(); i += NumElts) { 6912 WhichResult = SelectPairHalf(NumElts, M, i); 6913 for (unsigned j = 0; j < NumElts; j += Half) { 6914 unsigned Idx = WhichResult; 6915 for (unsigned k = 0; k < Half; ++k) { 6916 int MIdx = M[i + j + k]; 6917 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6918 return false; 6919 Idx += 2; 6920 } 6921 } 6922 } 6923 6924 if (M.size() == NumElts*2) 6925 WhichResult = 0; 6926 6927 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6928 if (VT.is64BitVector() && EltSz == 32) 6929 return false; 6930 6931 return true; 6932 } 6933 6934 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6935 // that pairs of elements of the shufflemask represent the same index in each 6936 // vector incrementing sequentially through the vectors. 6937 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6938 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6939 // v2={e,f,g,h} 6940 // Requires similar checks to that of isVTRNMask with respect the how results 6941 // are returned. 
6942 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6943 unsigned EltSz = VT.getScalarSizeInBits(); 6944 if (EltSz == 64) 6945 return false; 6946 6947 unsigned NumElts = VT.getVectorNumElements(); 6948 if (M.size() != NumElts && M.size() != NumElts*2) 6949 return false; 6950 6951 for (unsigned i = 0; i < M.size(); i += NumElts) { 6952 WhichResult = SelectPairHalf(NumElts, M, i); 6953 unsigned Idx = WhichResult * NumElts / 2; 6954 for (unsigned j = 0; j < NumElts; j += 2) { 6955 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6956 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 6957 return false; 6958 Idx += 1; 6959 } 6960 } 6961 6962 if (M.size() == NumElts*2) 6963 WhichResult = 0; 6964 6965 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6966 if (VT.is64BitVector() && EltSz == 32) 6967 return false; 6968 6969 return true; 6970 } 6971 6972 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 6973 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6974 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 6975 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6976 unsigned EltSz = VT.getScalarSizeInBits(); 6977 if (EltSz == 64) 6978 return false; 6979 6980 unsigned NumElts = VT.getVectorNumElements(); 6981 if (M.size() != NumElts && M.size() != NumElts*2) 6982 return false; 6983 6984 for (unsigned i = 0; i < M.size(); i += NumElts) { 6985 WhichResult = SelectPairHalf(NumElts, M, i); 6986 unsigned Idx = WhichResult * NumElts / 2; 6987 for (unsigned j = 0; j < NumElts; j += 2) { 6988 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6989 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 6990 return false; 6991 Idx += 1; 6992 } 6993 } 6994 6995 if (M.size() == NumElts*2) 6996 WhichResult = 0; 6997 6998 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6999 if (VT.is64BitVector() && EltSz == 32) 7000 return false; 7001 7002 return true; 7003 } 7004 7005 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 7006 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 7007 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 7008 unsigned &WhichResult, 7009 bool &isV_UNDEF) { 7010 isV_UNDEF = false; 7011 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 7012 return ARMISD::VTRN; 7013 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7014 return ARMISD::VUZP; 7015 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7016 return ARMISD::VZIP; 7017 7018 isV_UNDEF = true; 7019 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7020 return ARMISD::VTRN; 7021 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7022 return ARMISD::VUZP; 7023 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7024 return ARMISD::VZIP; 7025 7026 return 0; 7027 } 7028 7029 /// \return true if this is a reverse operation on an vector. 7030 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7031 unsigned NumElts = VT.getVectorNumElements(); 7032 // Make sure the mask has the right size. 7033 if (NumElts != M.size()) 7034 return false; 7035 7036 // Look for <15, ..., 3, -1, 1, 0>. 7037 for (unsigned i = 0; i != NumElts; ++i) 7038 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7039 return false; 7040 7041 return true; 7042 } 7043 7044 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { 7045 unsigned NumElts = VT.getVectorNumElements(); 7046 // Make sure the mask has the right size. 7047 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7048 return false; 7049 7050 // If Top 7051 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7052 // This inserts Input2 into Input1 7053 // else if not Top 7054 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7055 // This inserts Input1 into Input2 7056 unsigned Offset = Top ? 
0 : 1; 7057 for (unsigned i = 0; i < NumElts; i+=2) { 7058 if (M[i] >= 0 && M[i] != (int)i) 7059 return false; 7060 if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) 7061 return false; 7062 } 7063 7064 return true; 7065 } 7066 7067 // If N is an integer constant that can be moved into a register in one 7068 // instruction, return an SDValue of such a constant (will become a MOV 7069 // instruction). Otherwise return null. 7070 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7071 const ARMSubtarget *ST, const SDLoc &dl) { 7072 uint64_t Val; 7073 if (!isa<ConstantSDNode>(N)) 7074 return SDValue(); 7075 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7076 7077 if (ST->isThumb1Only()) { 7078 if (Val <= 255 || ~Val <= 255) 7079 return DAG.getConstant(Val, dl, MVT::i32); 7080 } else { 7081 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7082 return DAG.getConstant(Val, dl, MVT::i32); 7083 } 7084 return SDValue(); 7085 } 7086 7087 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7088 const ARMSubtarget *ST) { 7089 SDLoc dl(Op); 7090 EVT VT = Op.getValueType(); 7091 7092 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7093 7094 unsigned NumElts = VT.getVectorNumElements(); 7095 unsigned BoolMask; 7096 unsigned BitsPerBool; 7097 if (NumElts == 4) { 7098 BitsPerBool = 4; 7099 BoolMask = 0xf; 7100 } else if (NumElts == 8) { 7101 BitsPerBool = 2; 7102 BoolMask = 0x3; 7103 } else if (NumElts == 16) { 7104 BitsPerBool = 1; 7105 BoolMask = 0x1; 7106 } else 7107 return SDValue(); 7108 7109 // If this is a single value copied into all lanes (a splat), we can just sign 7110 // extend that single value 7111 SDValue FirstOp = Op.getOperand(0); 7112 if (!isa<ConstantSDNode>(FirstOp) && 7113 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7114 [&FirstOp](SDUse &U) { 7115 return U.get().isUndef() || U.get() == FirstOp; 7116 })) { 7117 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, 
MVT::i32, FirstOp, 7118 DAG.getValueType(MVT::i1)); 7119 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7120 } 7121 7122 // First create base with bits set where known 7123 unsigned Bits32 = 0; 7124 for (unsigned i = 0; i < NumElts; ++i) { 7125 SDValue V = Op.getOperand(i); 7126 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7127 continue; 7128 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); 7129 if (BitSet) 7130 Bits32 |= BoolMask << (i * BitsPerBool); 7131 } 7132 7133 // Add in unknown nodes 7134 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7135 DAG.getConstant(Bits32, dl, MVT::i32)); 7136 for (unsigned i = 0; i < NumElts; ++i) { 7137 SDValue V = Op.getOperand(i); 7138 if (isa<ConstantSDNode>(V) || V.isUndef()) 7139 continue; 7140 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7141 DAG.getConstant(i, dl, MVT::i32)); 7142 } 7143 7144 return Base; 7145 } 7146 7147 // If this is a case we can't handle, return null and let the default 7148 // expansion code take care of it. 7149 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7150 const ARMSubtarget *ST) const { 7151 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7152 SDLoc dl(Op); 7153 EVT VT = Op.getValueType(); 7154 7155 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7156 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7157 7158 APInt SplatBits, SplatUndef; 7159 unsigned SplatBitSize; 7160 bool HasAnyUndefs; 7161 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7162 if (SplatUndef.isAllOnesValue()) 7163 return DAG.getUNDEF(VT); 7164 7165 if ((ST->hasNEON() && SplatBitSize <= 64) || 7166 (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { 7167 // Check if an immediate VMOV works. 
7168 EVT VmovVT; 7169 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 7170 SplatUndef.getZExtValue(), SplatBitSize, 7171 DAG, dl, VmovVT, VT.is128BitVector(), 7172 VMOVModImm); 7173 7174 if (Val.getNode()) { 7175 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7176 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7177 } 7178 7179 // Try an immediate VMVN. 7180 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7181 Val = isVMOVModifiedImm( 7182 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, 7183 DAG, dl, VmovVT, VT.is128BitVector(), 7184 ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7185 if (Val.getNode()) { 7186 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7187 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7188 } 7189 7190 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7191 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7192 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7193 if (ImmVal != -1) { 7194 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7195 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7196 } 7197 } 7198 } 7199 } 7200 7201 // Scan through the operands to see if only one value is used. 7202 // 7203 // As an optimisation, even if more than one value is used it may be more 7204 // profitable to splat with one value then change some lanes. 7205 // 7206 // Heuristically we decide to do this if the vector has a "dominant" value, 7207 // defined as splatted to more than half of the lanes. 7208 unsigned NumElts = VT.getVectorNumElements(); 7209 bool isOnlyLowElement = true; 7210 bool usesOnlyOneValue = true; 7211 bool hasDominantValue = false; 7212 bool isConstant = true; 7213 7214 // Map of the number of times a particular SDValue appears in the 7215 // element list. 
7216 DenseMap<SDValue, unsigned> ValueCounts; 7217 SDValue Value; 7218 for (unsigned i = 0; i < NumElts; ++i) { 7219 SDValue V = Op.getOperand(i); 7220 if (V.isUndef()) 7221 continue; 7222 if (i > 0) 7223 isOnlyLowElement = false; 7224 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7225 isConstant = false; 7226 7227 ValueCounts.insert(std::make_pair(V, 0)); 7228 unsigned &Count = ValueCounts[V]; 7229 7230 // Is this value dominant? (takes up more than half of the lanes) 7231 if (++Count > (NumElts / 2)) { 7232 hasDominantValue = true; 7233 Value = V; 7234 } 7235 } 7236 if (ValueCounts.size() != 1) 7237 usesOnlyOneValue = false; 7238 if (!Value.getNode() && !ValueCounts.empty()) 7239 Value = ValueCounts.begin()->first; 7240 7241 if (ValueCounts.empty()) 7242 return DAG.getUNDEF(VT); 7243 7244 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7245 // Keep going if we are hitting this case. 7246 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7247 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7248 7249 unsigned EltSize = VT.getScalarSizeInBits(); 7250 7251 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7252 // i32 and try again. 7253 if (hasDominantValue && EltSize <= 32) { 7254 if (!isConstant) { 7255 SDValue N; 7256 7257 // If we are VDUPing a value that comes directly from a vector, that will 7258 // cause an unnecessary move to and from a GPR, where instead we could 7259 // just use VDUPLANE. We can only do this if the lane being extracted 7260 // is at a constant index, as the VDUP from lane instructions only have 7261 // constant-index forms. 
7262 ConstantSDNode *constIndex; 7263 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7264 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7265 // We need to create a new undef vector to use for the VDUPLANE if the 7266 // size of the vector from which we get the value is different than the 7267 // size of the vector that we need to create. We will insert the element 7268 // such that the register coalescer will remove unnecessary copies. 7269 if (VT != Value->getOperand(0).getValueType()) { 7270 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7271 VT.getVectorNumElements(); 7272 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7273 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7274 Value, DAG.getConstant(index, dl, MVT::i32)), 7275 DAG.getConstant(index, dl, MVT::i32)); 7276 } else 7277 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7278 Value->getOperand(0), Value->getOperand(1)); 7279 } else 7280 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7281 7282 if (!usesOnlyOneValue) { 7283 // The dominant value was splatted as 'N', but we now have to insert 7284 // all differing elements. 7285 for (unsigned I = 0; I < NumElts; ++I) { 7286 if (Op.getOperand(I) == Value) 7287 continue; 7288 SmallVector<SDValue, 3> Ops; 7289 Ops.push_back(N); 7290 Ops.push_back(Op.getOperand(I)); 7291 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7292 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7293 } 7294 } 7295 return N; 7296 } 7297 if (VT.getVectorElementType().isFloatingPoint()) { 7298 SmallVector<SDValue, 8> Ops; 7299 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7300 assert(FVT == MVT::f32 || FVT == MVT::f16); 7301 MVT IVT = (FVT == MVT::f32) ? 
MVT::i32 : MVT::i16; 7302 for (unsigned i = 0; i < NumElts; ++i) 7303 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7304 Op.getOperand(i))); 7305 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7306 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7307 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7308 if (Val.getNode()) 7309 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7310 } 7311 if (usesOnlyOneValue) { 7312 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7313 if (isConstant && Val.getNode()) 7314 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7315 } 7316 } 7317 7318 // If all elements are constants and the case above didn't get hit, fall back 7319 // to the default expansion, which will generate a load from the constant 7320 // pool. 7321 if (isConstant) 7322 return SDValue(); 7323 7324 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 7325 if (NumElts >= 4) { 7326 SDValue shuffle = ReconstructShuffle(Op, DAG); 7327 if (shuffle != SDValue()) 7328 return shuffle; 7329 } 7330 7331 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7332 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7333 // into two 64-bit vectors; we might discover a better way to lower it. 
7334 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7335 EVT ExtVT = VT.getVectorElementType(); 7336 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7337 SDValue Lower = 7338 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7339 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7340 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7341 SDValue Upper = DAG.getBuildVector( 7342 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7343 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7344 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7345 if (Lower && Upper) 7346 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7347 } 7348 7349 // Vectors with 32- or 64-bit elements can be built by directly assigning 7350 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7351 // will be legalized. 7352 if (EltSize >= 32) { 7353 // Do the expansion with floating-point types, since that is what the VFP 7354 // registers are defined to use, and since i64 is not legal. 7355 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7356 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7357 SmallVector<SDValue, 8> Ops; 7358 for (unsigned i = 0; i < NumElts; ++i) 7359 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7360 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7361 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7362 } 7363 7364 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7365 // know the default expansion would otherwise fall back on something even 7366 // worse. For a vector with one or two non-undef values, that's 7367 // scalar_to_vector for the elements followed by a shuffle (provided the 7368 // shuffle is valid for the target) and materialization element by element 7369 // on the stack followed by a load for everything else. 
7370 if (!isConstant && !usesOnlyOneValue) { 7371 SDValue Vec = DAG.getUNDEF(VT); 7372 for (unsigned i = 0 ; i < NumElts; ++i) { 7373 SDValue V = Op.getOperand(i); 7374 if (V.isUndef()) 7375 continue; 7376 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 7377 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7378 } 7379 return Vec; 7380 } 7381 7382 return SDValue(); 7383 } 7384 7385 // Gather data to see if the operation can be modelled as a 7386 // shuffle in combination with VEXTs. 7387 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 7388 SelectionDAG &DAG) const { 7389 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7390 SDLoc dl(Op); 7391 EVT VT = Op.getValueType(); 7392 unsigned NumElts = VT.getVectorNumElements(); 7393 7394 struct ShuffleSourceInfo { 7395 SDValue Vec; 7396 unsigned MinElt = std::numeric_limits<unsigned>::max(); 7397 unsigned MaxElt = 0; 7398 7399 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 7400 // be compatible with the shuffle we intend to construct. As a result 7401 // ShuffleVec will be some sliding window into the original Vec. 7402 SDValue ShuffleVec; 7403 7404 // Code should guarantee that element i in Vec starts at element "WindowBase 7405 // + i * WindowScale in ShuffleVec". 7406 int WindowBase = 0; 7407 int WindowScale = 1; 7408 7409 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 7410 7411 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 7412 }; 7413 7414 // First gather all vectors used as an immediate source for this BUILD_VECTOR 7415 // node. 7416 SmallVector<ShuffleSourceInfo, 2> Sources; 7417 for (unsigned i = 0; i < NumElts; ++i) { 7418 SDValue V = Op.getOperand(i); 7419 if (V.isUndef()) 7420 continue; 7421 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 7422 // A shuffle can only come from building a vector from various 7423 // elements of other vectors. 
7424 return SDValue(); 7425 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 7426 // Furthermore, shuffles require a constant mask, whereas extractelts 7427 // accept variable indices. 7428 return SDValue(); 7429 } 7430 7431 // Add this element source to the list if it's not already there. 7432 SDValue SourceVec = V.getOperand(0); 7433 auto Source = llvm::find(Sources, SourceVec); 7434 if (Source == Sources.end()) 7435 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7436 7437 // Update the minimum and maximum lane number seen. 7438 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7439 Source->MinElt = std::min(Source->MinElt, EltNo); 7440 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7441 } 7442 7443 // Currently only do something sane when at most two source vectors 7444 // are involved. 7445 if (Sources.size() > 2) 7446 return SDValue(); 7447 7448 // Find out the smallest element size among result and two sources, and use 7449 // it as element size to build the shuffle_vector. 7450 EVT SmallestEltTy = VT.getVectorElementType(); 7451 for (auto &Source : Sources) { 7452 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7453 if (SrcEltTy.bitsLT(SmallestEltTy)) 7454 SmallestEltTy = SrcEltTy; 7455 } 7456 unsigned ResMultiplier = 7457 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7458 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7459 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7460 7461 // If the source vector is too wide or too narrow, we may nevertheless be able 7462 // to construct a compatible shuffle either by concatenating it with UNDEF or 7463 // extracting a suitable range of elements. 
7464 for (auto &Src : Sources) { 7465 EVT SrcVT = Src.ShuffleVec.getValueType(); 7466 7467 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 7468 continue; 7469 7470 // This stage of the search produces a source with the same element type as 7471 // the original, but with a total width matching the BUILD_VECTOR output. 7472 EVT EltVT = SrcVT.getVectorElementType(); 7473 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 7474 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7475 7476 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 7477 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 7478 return SDValue(); 7479 // We can pad out the smaller vector for free, so if it's part of a 7480 // shuffle... 7481 Src.ShuffleVec = 7482 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 7483 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 7484 continue; 7485 } 7486 7487 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 7488 return SDValue(); 7489 7490 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 7491 // Span too large for a VEXT to cope 7492 return SDValue(); 7493 } 7494 7495 if (Src.MinElt >= NumSrcElts) { 7496 // The extraction can just take the second half 7497 Src.ShuffleVec = 7498 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7499 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7500 Src.WindowBase = -NumSrcElts; 7501 } else if (Src.MaxElt < NumSrcElts) { 7502 // The extraction can just take the first half 7503 Src.ShuffleVec = 7504 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7505 DAG.getConstant(0, dl, MVT::i32)); 7506 } else { 7507 // An actual VEXT is needed 7508 SDValue VEXTSrc1 = 7509 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7510 DAG.getConstant(0, dl, MVT::i32)); 7511 SDValue VEXTSrc2 = 7512 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7513 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7514 7515 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, 
DestVT, VEXTSrc1, 7516 VEXTSrc2, 7517 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 7518 Src.WindowBase = -Src.MinElt; 7519 } 7520 } 7521 7522 // Another possible incompatibility occurs from the vector element types. We 7523 // can fix this by bitcasting the source vectors to the same type we intend 7524 // for the shuffle. 7525 for (auto &Src : Sources) { 7526 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 7527 if (SrcEltTy == SmallestEltTy) 7528 continue; 7529 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 7530 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 7531 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7532 Src.WindowBase *= Src.WindowScale; 7533 } 7534 7535 // Final sanity check before we try to actually produce a shuffle. 7536 LLVM_DEBUG(for (auto Src 7537 : Sources) 7538 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 7539 7540 // The stars all align, our next step is to produce the mask for the shuffle. 7541 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 7542 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 7543 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 7544 SDValue Entry = Op.getOperand(i); 7545 if (Entry.isUndef()) 7546 continue; 7547 7548 auto Src = llvm::find(Sources, Entry.getOperand(0)); 7549 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 7550 7551 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 7552 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 7553 // segment. 7554 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 7555 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 7556 VT.getScalarSizeInBits()); 7557 int LanesDefined = BitsDefined / BitsPerShuffleLane; 7558 7559 // This source is expected to fill ResMultiplier lanes of the final shuffle, 7560 // starting at the appropriate offset. 
7561 int *LaneMask = &Mask[i * ResMultiplier]; 7562 7563 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 7564 ExtractBase += NumElts * (Src - Sources.begin()); 7565 for (int j = 0; j < LanesDefined; ++j) 7566 LaneMask[j] = ExtractBase + j; 7567 } 7568 7569 7570 // We can't handle more than two sources. This should have already 7571 // been checked before this point. 7572 assert(Sources.size() <= 2 && "Too many sources!"); 7573 7574 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 7575 for (unsigned i = 0; i < Sources.size(); ++i) 7576 ShuffleOps[i] = Sources[i].ShuffleVec; 7577 7578 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 7579 ShuffleOps[1], Mask, DAG); 7580 if (!Shuffle) 7581 return SDValue(); 7582 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 7583 } 7584 7585 enum ShuffleOpCodes { 7586 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7587 OP_VREV, 7588 OP_VDUP0, 7589 OP_VDUP1, 7590 OP_VDUP2, 7591 OP_VDUP3, 7592 OP_VEXT1, 7593 OP_VEXT2, 7594 OP_VEXT3, 7595 OP_VUZPL, // VUZP, left result 7596 OP_VUZPR, // VUZP, right result 7597 OP_VZIPL, // VZIP, left result 7598 OP_VZIPR, // VZIP, right result 7599 OP_VTRNL, // VTRN, left result 7600 OP_VTRNR // VTRN, right result 7601 }; 7602 7603 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 7604 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7605 switch (OpNum) { 7606 case OP_COPY: 7607 case OP_VREV: 7608 case OP_VDUP0: 7609 case OP_VDUP1: 7610 case OP_VDUP2: 7611 case OP_VDUP3: 7612 return true; 7613 } 7614 return false; 7615 } 7616 7617 /// isShuffleMaskLegal - Targets can use this to indicate that they only 7618 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7619 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7620 /// are assumed to be legal. 
7621 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7622 if (VT.getVectorNumElements() == 4 && 7623 (VT.is128BitVector() || VT.is64BitVector())) { 7624 unsigned PFIndexes[4]; 7625 for (unsigned i = 0; i != 4; ++i) { 7626 if (M[i] < 0) 7627 PFIndexes[i] = 8; 7628 else 7629 PFIndexes[i] = M[i]; 7630 } 7631 7632 // Compute the index in the perfect shuffle table. 7633 unsigned PFTableIndex = 7634 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7635 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7636 unsigned Cost = (PFEntry >> 30); 7637 7638 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7639 return true; 7640 } 7641 7642 bool ReverseVEXT, isV_UNDEF; 7643 unsigned Imm, WhichResult; 7644 7645 unsigned EltSize = VT.getScalarSizeInBits(); 7646 if (EltSize >= 32 || 7647 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7648 ShuffleVectorInst::isIdentityMask(M) || 7649 isVREVMask(M, VT, 64) || 7650 isVREVMask(M, VT, 32) || 7651 isVREVMask(M, VT, 16)) 7652 return true; 7653 else if (Subtarget->hasNEON() && 7654 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7655 isVTBLMask(M, VT) || 7656 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7657 return true; 7658 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7659 isReverseMask(M, VT)) 7660 return true; 7661 else if (Subtarget->hasMVEIntegerOps() && 7662 (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) 7663 return true; 7664 else 7665 return false; 7666 } 7667 7668 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7669 /// the specified operations to build the shuffle. 
7670 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7671 SDValue RHS, SelectionDAG &DAG, 7672 const SDLoc &dl) { 7673 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7674 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7675 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7676 7677 if (OpNum == OP_COPY) { 7678 if (LHSID == (1*9+2)*9+3) return LHS; 7679 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 7680 return RHS; 7681 } 7682 7683 SDValue OpLHS, OpRHS; 7684 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7685 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7686 EVT VT = OpLHS.getValueType(); 7687 7688 switch (OpNum) { 7689 default: llvm_unreachable("Unknown shuffle opcode!"); 7690 case OP_VREV: 7691 // VREV divides the vector in half and swaps within the half. 7692 if (VT.getVectorElementType() == MVT::i32 || 7693 VT.getVectorElementType() == MVT::f32) 7694 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 7695 // vrev <4 x i16> -> VREV32 7696 if (VT.getVectorElementType() == MVT::i16) 7697 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 7698 // vrev <4 x i8> -> VREV16 7699 assert(VT.getVectorElementType() == MVT::i8); 7700 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 7701 case OP_VDUP0: 7702 case OP_VDUP1: 7703 case OP_VDUP2: 7704 case OP_VDUP3: 7705 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7706 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 7707 case OP_VEXT1: 7708 case OP_VEXT2: 7709 case OP_VEXT3: 7710 return DAG.getNode(ARMISD::VEXT, dl, VT, 7711 OpLHS, OpRHS, 7712 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 7713 case OP_VUZPL: 7714 case OP_VUZPR: 7715 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 7716 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 7717 case OP_VZIPL: 7718 case OP_VZIPR: 7719 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 7720 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 7721 case OP_VTRNL: 7722 case 
OP_VTRNR: 7723 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 7724 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 7725 } 7726 } 7727 7728 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 7729 ArrayRef<int> ShuffleMask, 7730 SelectionDAG &DAG) { 7731 // Check to see if we can use the VTBL instruction. 7732 SDValue V1 = Op.getOperand(0); 7733 SDValue V2 = Op.getOperand(1); 7734 SDLoc DL(Op); 7735 7736 SmallVector<SDValue, 8> VTBLMask; 7737 for (ArrayRef<int>::iterator 7738 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 7739 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 7740 7741 if (V2.getNode()->isUndef()) 7742 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 7743 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7744 7745 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 7746 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7747 } 7748 7749 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 7750 SelectionDAG &DAG) { 7751 SDLoc DL(Op); 7752 SDValue OpLHS = Op.getOperand(0); 7753 EVT VT = OpLHS.getValueType(); 7754 7755 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 7756 "Expect an v8i16/v16i8 type"); 7757 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 7758 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 7759 // extract the first 8 bytes into the top double word and the last 8 bytes 7760 // into the bottom double word. The v8i16 case is similar. 7761 unsigned ExtractNum = (VT == MVT::v16i8) ? 
8 : 4; 7762 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 7763 DAG.getConstant(ExtractNum, DL, MVT::i32)); 7764 } 7765 7766 static EVT getVectorTyFromPredicateVector(EVT VT) { 7767 switch (VT.getSimpleVT().SimpleTy) { 7768 case MVT::v4i1: 7769 return MVT::v4i32; 7770 case MVT::v8i1: 7771 return MVT::v8i16; 7772 case MVT::v16i1: 7773 return MVT::v16i8; 7774 default: 7775 llvm_unreachable("Unexpected vector predicate type"); 7776 } 7777 } 7778 7779 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 7780 SelectionDAG &DAG) { 7781 // Converting from boolean predicates to integers involves creating a vector 7782 // of all ones or all zeroes and selecting the lanes based upon the real 7783 // predicate. 7784 SDValue AllOnes = 7785 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 7786 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 7787 7788 SDValue AllZeroes = 7789 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 7790 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 7791 7792 // Get full vector type from predicate type 7793 EVT NewVT = getVectorTyFromPredicateVector(VT); 7794 7795 SDValue RecastV1; 7796 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 7797 // this to a v16i1. This cannot be done with an ordinary bitcast because the 7798 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 7799 // since we know in hardware the sizes are really the same. 7800 if (VT != MVT::v16i1) 7801 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 7802 else 7803 RecastV1 = Pred; 7804 7805 // Select either all ones or zeroes depending upon the real predicate bits. 7806 SDValue PredAsVector = 7807 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 7808 7809 // Recast our new predicate-as-integer v16i8 vector into something 7810 // appropriate for the shuffle, i.e. 
v4i32 for a real v4i1 predicate. 7811 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 7812 } 7813 7814 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 7815 const ARMSubtarget *ST) { 7816 EVT VT = Op.getValueType(); 7817 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7818 ArrayRef<int> ShuffleMask = SVN->getMask(); 7819 7820 assert(ST->hasMVEIntegerOps() && 7821 "No support for vector shuffle of boolean predicates"); 7822 7823 SDValue V1 = Op.getOperand(0); 7824 SDLoc dl(Op); 7825 if (isReverseMask(ShuffleMask, VT)) { 7826 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 7827 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 7828 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 7829 DAG.getConstant(16, dl, MVT::i32)); 7830 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 7831 } 7832 7833 // Until we can come up with optimised cases for every single vector 7834 // shuffle in existence we have chosen the least painful strategy. This is 7835 // to essentially promote the boolean predicate to a 8-bit integer, where 7836 // each predicate represents a byte. Then we fall back on a normal integer 7837 // vector shuffle and convert the result back into a predicate vector. In 7838 // many cases the generated code might be even better than scalar code 7839 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 7840 // fields in a register into 8 other arbitrary 2-bit fields! 7841 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 7842 EVT NewVT = PredAsVector.getValueType(); 7843 7844 // Do the shuffle! 7845 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 7846 DAG.getUNDEF(NewVT), ShuffleMask); 7847 7848 // Now return the result of comparing the shuffled vector with zero, 7849 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
7850 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 7851 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 7852 } 7853 7854 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 7855 ArrayRef<int> ShuffleMask, 7856 SelectionDAG &DAG) { 7857 // Attempt to lower the vector shuffle using as many whole register movs as 7858 // possible. This is useful for types smaller than 32bits, which would 7859 // often otherwise become a series for grp movs. 7860 SDLoc dl(Op); 7861 EVT VT = Op.getValueType(); 7862 if (VT.getScalarSizeInBits() >= 32) 7863 return SDValue(); 7864 7865 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 7866 "Unexpected vector type"); 7867 int NumElts = VT.getVectorNumElements(); 7868 int QuarterSize = NumElts / 4; 7869 // The four final parts of the vector, as i32's 7870 SDValue Parts[4]; 7871 7872 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not 7873 // <u,u,u,u>), returning the vmov lane index 7874 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 7875 // Detect which mov lane this would be from the first non-undef element. 
7876 int MovIdx = -1; 7877 for (int i = 0; i < Length; i++) { 7878 if (ShuffleMask[Start + i] >= 0) { 7879 if (ShuffleMask[Start + i] % Length != i) 7880 return -1; 7881 MovIdx = ShuffleMask[Start + i] / Length; 7882 break; 7883 } 7884 } 7885 // If all items are undef, leave this for other combines 7886 if (MovIdx == -1) 7887 return -1; 7888 // Check the remaining values are the correct part of the same mov 7889 for (int i = 1; i < Length; i++) { 7890 if (ShuffleMask[Start + i] >= 0 && 7891 (ShuffleMask[Start + i] / Length != MovIdx || 7892 ShuffleMask[Start + i] % Length != i)) 7893 return -1; 7894 } 7895 return MovIdx; 7896 }; 7897 7898 for (int Part = 0; Part < 4; ++Part) { 7899 // Does this part look like a mov 7900 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 7901 if (Elt != -1) { 7902 SDValue Input = Op->getOperand(0); 7903 if (Elt >= 4) { 7904 Input = Op->getOperand(1); 7905 Elt -= 4; 7906 } 7907 SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); 7908 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, 7909 DAG.getConstant(Elt, dl, MVT::i32)); 7910 } 7911 } 7912 7913 // Nothing interesting found, just return 7914 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 7915 return SDValue(); 7916 7917 // The other parts need to be built with the old shuffle vector, cast to a 7918 // v4i32 and extract_vector_elts 7919 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 7920 SmallVector<int, 16> NewShuffleMask; 7921 for (int Part = 0; Part < 4; ++Part) 7922 for (int i = 0; i < QuarterSize; i++) 7923 NewShuffleMask.push_back( 7924 Parts[Part] ? 
-1 : ShuffleMask[Part * QuarterSize + i]); 7925 SDValue NewShuffle = DAG.getVectorShuffle( 7926 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 7927 SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); 7928 7929 for (int Part = 0; Part < 4; ++Part) 7930 if (!Parts[Part]) 7931 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7932 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 7933 } 7934 // Build a vector out of the various parts and bitcast it back to the original 7935 // type. 7936 SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); 7937 return DAG.getBitcast(VT, NewVec); 7938 } 7939 7940 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 7941 const ARMSubtarget *ST) { 7942 SDValue V1 = Op.getOperand(0); 7943 SDValue V2 = Op.getOperand(1); 7944 SDLoc dl(Op); 7945 EVT VT = Op.getValueType(); 7946 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7947 unsigned EltSize = VT.getScalarSizeInBits(); 7948 7949 if (ST->hasMVEIntegerOps() && EltSize == 1) 7950 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 7951 7952 // Convert shuffles that are directly supported on NEON to target-specific 7953 // DAG nodes, instead of keeping them as shuffles and matching them again 7954 // during code selection. This is more efficient and avoids the possibility 7955 // of inconsistencies between legalization and selection. 7956 // FIXME: floating-point vectors should be canonicalized to integer vectors 7957 // of the same time so that they get CSEd properly. 7958 ArrayRef<int> ShuffleMask = SVN->getMask(); 7959 7960 if (EltSize <= 32) { 7961 if (SVN->isSplat()) { 7962 int Lane = SVN->getSplatIndex(); 7963 // If this is undef splat, generate it via "just" vdup, if possible. 7964 if (Lane == -1) Lane = 0; 7965 7966 // Test if V1 is a SCALAR_TO_VECTOR. 
7967 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7968 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7969 } 7970 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 7971 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 7972 // reaches it). 7973 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 7974 !isa<ConstantSDNode>(V1.getOperand(0))) { 7975 bool IsScalarToVector = true; 7976 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7977 if (!V1.getOperand(i).isUndef()) { 7978 IsScalarToVector = false; 7979 break; 7980 } 7981 if (IsScalarToVector) 7982 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7983 } 7984 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 7985 DAG.getConstant(Lane, dl, MVT::i32)); 7986 } 7987 7988 bool ReverseVEXT = false; 7989 unsigned Imm = 0; 7990 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7991 if (ReverseVEXT) 7992 std::swap(V1, V2); 7993 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7994 DAG.getConstant(Imm, dl, MVT::i32)); 7995 } 7996 7997 if (isVREVMask(ShuffleMask, VT, 64)) 7998 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7999 if (isVREVMask(ShuffleMask, VT, 32)) 8000 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8001 if (isVREVMask(ShuffleMask, VT, 16)) 8002 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8003 8004 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8005 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8006 DAG.getConstant(Imm, dl, MVT::i32)); 8007 } 8008 8009 // Check for Neon shuffles that modify both input vectors in place. 8010 // If both results are used, i.e., if there are two shuffles with the same 8011 // source operands and with masks corresponding to both results of one of 8012 // these operations, DAG memoization will ensure that a single node is 8013 // used for both shuffles. 
8014 unsigned WhichResult = 0; 8015 bool isV_UNDEF = false; 8016 if (ST->hasNEON()) { 8017 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8018 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8019 if (isV_UNDEF) 8020 V2 = V1; 8021 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8022 .getValue(WhichResult); 8023 } 8024 } 8025 if (ST->hasMVEIntegerOps()) { 8026 if (isVMOVNMask(ShuffleMask, VT, 0)) 8027 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8028 DAG.getConstant(0, dl, MVT::i32)); 8029 if (isVMOVNMask(ShuffleMask, VT, 1)) 8030 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8031 DAG.getConstant(1, dl, MVT::i32)); 8032 } 8033 8034 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8035 // shuffles that produce a result larger than their operands with: 8036 // shuffle(concat(v1, undef), concat(v2, undef)) 8037 // -> 8038 // shuffle(concat(v1, v2), undef) 8039 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8040 // 8041 // This is useful in the general case, but there are special cases where 8042 // native shuffles produce larger results: the two-result ops. 8043 // 8044 // Look through the concat when lowering them: 8045 // shuffle(concat(v1, v2), undef) 8046 // -> 8047 // concat(VZIP(v1, v2):0, :1) 8048 // 8049 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8050 SDValue SubV1 = V1->getOperand(0); 8051 SDValue SubV2 = V1->getOperand(1); 8052 EVT SubVT = SubV1.getValueType(); 8053 8054 // We expect these to have been canonicalized to -1. 
8055 assert(llvm::all_of(ShuffleMask, [&](int i) { 8056 return i < (int)VT.getVectorNumElements(); 8057 }) && "Unexpected shuffle index into UNDEF operand!"); 8058 8059 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8060 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8061 if (isV_UNDEF) 8062 SubV2 = SubV1; 8063 assert((WhichResult == 0) && 8064 "In-place shuffle of concat can only have one result!"); 8065 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8066 SubV1, SubV2); 8067 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8068 Res.getValue(1)); 8069 } 8070 } 8071 } 8072 8073 // If the shuffle is not directly supported and it has 4 elements, use 8074 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8075 unsigned NumElts = VT.getVectorNumElements(); 8076 if (NumElts == 4) { 8077 unsigned PFIndexes[4]; 8078 for (unsigned i = 0; i != 4; ++i) { 8079 if (ShuffleMask[i] < 0) 8080 PFIndexes[i] = 8; 8081 else 8082 PFIndexes[i] = ShuffleMask[i]; 8083 } 8084 8085 // Compute the index in the perfect shuffle table. 8086 unsigned PFTableIndex = 8087 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8088 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8089 unsigned Cost = (PFEntry >> 30); 8090 8091 if (Cost <= 4) { 8092 if (ST->hasNEON()) 8093 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8094 else if (isLegalMVEShuffleOp(PFEntry)) { 8095 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8096 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8097 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8098 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8099 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8100 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8101 } 8102 } 8103 } 8104 8105 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 
8106 if (EltSize >= 32) { 8107 // Do the expansion with floating-point types, since that is what the VFP 8108 // registers are defined to use, and since i64 is not legal. 8109 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8110 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8111 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 8112 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 8113 SmallVector<SDValue, 8> Ops; 8114 for (unsigned i = 0; i < NumElts; ++i) { 8115 if (ShuffleMask[i] < 0) 8116 Ops.push_back(DAG.getUNDEF(EltVT)); 8117 else 8118 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 8119 ShuffleMask[i] < (int)NumElts ? V1 : V2, 8120 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8121 dl, MVT::i32))); 8122 } 8123 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8124 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8125 } 8126 8127 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 8128 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 8129 8130 if (ST->hasNEON() && VT == MVT::v8i8) 8131 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8132 return NewOp; 8133 8134 if (ST->hasMVEIntegerOps()) 8135 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8136 return NewOp; 8137 8138 return SDValue(); 8139 } 8140 8141 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8142 const ARMSubtarget *ST) { 8143 EVT VecVT = Op.getOperand(0).getValueType(); 8144 SDLoc dl(Op); 8145 8146 assert(ST->hasMVEIntegerOps() && 8147 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8148 8149 SDValue Conv = 8150 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8151 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8152 unsigned LaneWidth = 8153 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8154 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8155 SDValue Ext = 
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8156 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8157 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8158 DAG.getConstant(~Mask, dl, MVT::i32)); 8159 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8160 } 8161 8162 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8163 SelectionDAG &DAG) const { 8164 // INSERT_VECTOR_ELT is legal only for immediate indexes. 8165 SDValue Lane = Op.getOperand(2); 8166 if (!isa<ConstantSDNode>(Lane)) 8167 return SDValue(); 8168 8169 SDValue Elt = Op.getOperand(1); 8170 EVT EltVT = Elt.getValueType(); 8171 8172 if (Subtarget->hasMVEIntegerOps() && 8173 Op.getValueType().getScalarSizeInBits() == 1) 8174 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8175 8176 if (getTypeAction(*DAG.getContext(), EltVT) == 8177 TargetLowering::TypePromoteFloat) { 8178 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8179 // but the type system will try to do that if we don't intervene. 8180 // Reinterpret any such vector-element insertion as one with the 8181 // corresponding integer types. 
8182 8183 SDLoc dl(Op); 8184 8185 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8186 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8187 TargetLowering::TypePromoteFloat); 8188 8189 SDValue VecIn = Op.getOperand(0); 8190 EVT VecVT = VecIn.getValueType(); 8191 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8192 VecVT.getVectorNumElements()); 8193 8194 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8195 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8196 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8197 IVecIn, IElt, Lane); 8198 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8199 } 8200 8201 return Op; 8202 } 8203 8204 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8205 const ARMSubtarget *ST) { 8206 EVT VecVT = Op.getOperand(0).getValueType(); 8207 SDLoc dl(Op); 8208 8209 assert(ST->hasMVEIntegerOps() && 8210 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8211 8212 SDValue Conv = 8213 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8214 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8215 unsigned LaneWidth = 8216 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8217 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 8218 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8219 return Shift; 8220 } 8221 8222 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8223 const ARMSubtarget *ST) { 8224 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
8225 SDValue Lane = Op.getOperand(1); 8226 if (!isa<ConstantSDNode>(Lane)) 8227 return SDValue(); 8228 8229 SDValue Vec = Op.getOperand(0); 8230 EVT VT = Vec.getValueType(); 8231 8232 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8233 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); 8234 8235 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 8236 SDLoc dl(Op); 8237 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 8238 } 8239 8240 return Op; 8241 } 8242 8243 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, 8244 const ARMSubtarget *ST) { 8245 SDValue V1 = Op.getOperand(0); 8246 SDValue V2 = Op.getOperand(1); 8247 SDLoc dl(Op); 8248 EVT VT = Op.getValueType(); 8249 EVT Op1VT = V1.getValueType(); 8250 EVT Op2VT = V2.getValueType(); 8251 unsigned NumElts = VT.getVectorNumElements(); 8252 8253 assert(Op1VT == Op2VT && "Operand types don't match!"); 8254 assert(VT.getScalarSizeInBits() == 1 && 8255 "Unexpected custom CONCAT_VECTORS lowering"); 8256 assert(ST->hasMVEIntegerOps() && 8257 "CONCAT_VECTORS lowering only supported for MVE"); 8258 8259 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8260 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); 8261 8262 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets 8263 // promoted to v8i16, etc. 8264 8265 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8266 8267 // Extract the vector elements from Op1 and Op2 one by one and truncate them 8268 // to be the right size for the destination. For example, if Op1 is v4i1 then 8269 // the promoted vector is v4i32. The result of concatentation gives a v8i1, 8270 // which when promoted is v8i16. That means each i32 element from Op1 needs 8271 // truncating to i16 and inserting in the result. 
8272 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8273 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8274 auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8275 EVT NewVT = NewV.getValueType(); 8276 EVT ConcatVT = ConVec.getValueType(); 8277 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8278 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8279 DAG.getIntPtrConstant(i, dl)); 8280 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8281 DAG.getConstant(j, dl, MVT::i32)); 8282 } 8283 return ConVec; 8284 }; 8285 unsigned j = 0; 8286 ConVec = ExractInto(NewV1, ConVec, j); 8287 ConVec = ExractInto(NewV2, ConVec, j); 8288 8289 // Now return the result of comparing the subvector with zero, 8290 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8291 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8292 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8293 } 8294 8295 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8296 const ARMSubtarget *ST) { 8297 EVT VT = Op->getValueType(0); 8298 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8299 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8300 8301 // The only time a CONCAT_VECTORS operation can have legal types is when 8302 // two 64-bit vectors are concatenated to a 128-bit vector. 
8303 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 8304 "unexpected CONCAT_VECTORS"); 8305 SDLoc dl(Op); 8306 SDValue Val = DAG.getUNDEF(MVT::v2f64); 8307 SDValue Op0 = Op.getOperand(0); 8308 SDValue Op1 = Op.getOperand(1); 8309 if (!Op0.isUndef()) 8310 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8311 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 8312 DAG.getIntPtrConstant(0, dl)); 8313 if (!Op1.isUndef()) 8314 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8315 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 8316 DAG.getIntPtrConstant(1, dl)); 8317 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 8318 } 8319 8320 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 8321 const ARMSubtarget *ST) { 8322 SDValue V1 = Op.getOperand(0); 8323 SDValue V2 = Op.getOperand(1); 8324 SDLoc dl(Op); 8325 EVT VT = Op.getValueType(); 8326 EVT Op1VT = V1.getValueType(); 8327 unsigned NumElts = VT.getVectorNumElements(); 8328 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 8329 8330 assert(VT.getScalarSizeInBits() == 1 && 8331 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 8332 assert(ST->hasMVEIntegerOps() && 8333 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 8334 8335 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8336 8337 // We now have Op1 promoted to a vector of integers, where v8i1 gets 8338 // promoted to v8i16, etc. 
8339 8340 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8341 8342 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 8343 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 8344 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 8345 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 8346 DAG.getIntPtrConstant(i, dl)); 8347 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 8348 DAG.getConstant(j, dl, MVT::i32)); 8349 } 8350 8351 // Now return the result of comparing the subvector with zero, 8352 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8353 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 8354 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8355 } 8356 8357 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 8358 /// element has been zero/sign-extended, depending on the isSigned parameter, 8359 /// from an integer type half its size. 8360 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 8361 bool isSigned) { 8362 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 8363 EVT VT = N->getValueType(0); 8364 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 8365 SDNode *BVN = N->getOperand(0).getNode(); 8366 if (BVN->getValueType(0) != MVT::v4i32 || 8367 BVN->getOpcode() != ISD::BUILD_VECTOR) 8368 return false; 8369 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 8370 unsigned HiElt = 1 - LoElt; 8371 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 8372 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 8373 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 8374 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 8375 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 8376 return false; 8377 if (isSigned) { 8378 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 8379 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 8380 return true; 8381 } else { 8382 if (Hi0->isNullValue() && Hi1->isNullValue()) 8383 return true; 8384 } 8385 return false; 8386 } 8387 8388 if (N->getOpcode() != ISD::BUILD_VECTOR) 8389 return false; 8390 8391 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 8392 SDNode *Elt = N->getOperand(i).getNode(); 8393 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 8394 unsigned EltSize = VT.getScalarSizeInBits(); 8395 unsigned HalfSize = EltSize / 2; 8396 if (isSigned) { 8397 if (!isIntN(HalfSize, C->getSExtValue())) 8398 return false; 8399 } else { 8400 if (!isUIntN(HalfSize, C->getZExtValue())) 8401 return false; 8402 } 8403 continue; 8404 } 8405 return false; 8406 } 8407 8408 return true; 8409 } 8410 8411 /// isSignExtended - Check if a node is a vector value that is sign-extended 8412 /// or a constant BUILD_VECTOR with sign-extended elements. 8413 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 8414 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 8415 return true; 8416 if (isExtendedBUILD_VECTOR(N, DAG, true)) 8417 return true; 8418 return false; 8419 } 8420 8421 /// isZeroExtended - Check if a node is a vector value that is zero-extended 8422 /// or a constant BUILD_VECTOR with zero-extended elements. 
8423 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 8424 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 8425 return true; 8426 if (isExtendedBUILD_VECTOR(N, DAG, false)) 8427 return true; 8428 return false; 8429 } 8430 8431 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 8432 if (OrigVT.getSizeInBits() >= 64) 8433 return OrigVT; 8434 8435 assert(OrigVT.isSimple() && "Expecting a simple value type"); 8436 8437 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 8438 switch (OrigSimpleTy) { 8439 default: llvm_unreachable("Unexpected Vector Type"); 8440 case MVT::v2i8: 8441 case MVT::v2i16: 8442 return MVT::v2i32; 8443 case MVT::v4i8: 8444 return MVT::v4i16; 8445 } 8446 } 8447 8448 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 8449 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 8450 /// We insert the required extension here to get the vector to fill a D register. 8451 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 8452 const EVT &OrigTy, 8453 const EVT &ExtTy, 8454 unsigned ExtOpcode) { 8455 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 8456 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 8457 // 64-bits we need to insert a new extension so that it will be 64-bits. 8458 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 8459 if (OrigTy.getSizeInBits() >= 64) 8460 return N; 8461 8462 // Must extend size to at least 64 bits to be used as an operand for VMULL. 8463 EVT NewVT = getExtensionTo64Bits(OrigTy); 8464 8465 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 8466 } 8467 8468 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 8469 /// does not do any sign/zero extension. If the original vector is less 8470 /// than 64 bits, an appropriate extension will be added after the load to 8471 /// reach a total size of 64 bits. 
We have to add the extension separately 8472 /// because ARM does not have a sign/zero extending load for vectors. 8473 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 8474 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 8475 8476 // The load already has the right type. 8477 if (ExtendedTy == LD->getMemoryVT()) 8478 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 8479 LD->getBasePtr(), LD->getPointerInfo(), 8480 LD->getAlignment(), LD->getMemOperand()->getFlags()); 8481 8482 // We need to create a zextload/sextload. We cannot just create a load 8483 // followed by a zext/zext node because LowerMUL is also run during normal 8484 // operation legalization where we can't create illegal types. 8485 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 8486 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 8487 LD->getMemoryVT(), LD->getAlignment(), 8488 LD->getMemOperand()->getFlags()); 8489 } 8490 8491 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 8492 /// extending load, or BUILD_VECTOR with extended elements, return the 8493 /// unextended value. The unextended vector should be 64 bits so that it can 8494 /// be used as an operand to a VMULL instruction. If the original vector size 8495 /// before extension is less than 64 bits we add a an extension to resize 8496 /// the vector to 64 bits. 
8497 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 8498 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 8499 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 8500 N->getOperand(0)->getValueType(0), 8501 N->getValueType(0), 8502 N->getOpcode()); 8503 8504 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 8505 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && 8506 "Expected extending load"); 8507 8508 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); 8509 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); 8510 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 8511 SDValue extLoad = 8512 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); 8513 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); 8514 8515 return newLoad; 8516 } 8517 8518 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 8519 // have been legalized as a BITCAST from v4i32. 8520 if (N->getOpcode() == ISD::BITCAST) { 8521 SDNode *BVN = N->getOperand(0).getNode(); 8522 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 8523 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 8524 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 8525 return DAG.getBuildVector( 8526 MVT::v2i32, SDLoc(N), 8527 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 8528 } 8529 // Construct a new BUILD_VECTOR with elements truncated to half the size. 
8530 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 8531 EVT VT = N->getValueType(0); 8532 unsigned EltSize = VT.getScalarSizeInBits() / 2; 8533 unsigned NumElts = VT.getVectorNumElements(); 8534 MVT TruncVT = MVT::getIntegerVT(EltSize); 8535 SmallVector<SDValue, 8> Ops; 8536 SDLoc dl(N); 8537 for (unsigned i = 0; i != NumElts; ++i) { 8538 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 8539 const APInt &CInt = C->getAPIntValue(); 8540 // Element types smaller than 32 bits are not legal, so use i32 elements. 8541 // The values are implicitly truncated so sext vs. zext doesn't matter. 8542 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 8543 } 8544 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 8545 } 8546 8547 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 8548 unsigned Opcode = N->getOpcode(); 8549 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8550 SDNode *N0 = N->getOperand(0).getNode(); 8551 SDNode *N1 = N->getOperand(1).getNode(); 8552 return N0->hasOneUse() && N1->hasOneUse() && 8553 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 8554 } 8555 return false; 8556 } 8557 8558 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 8559 unsigned Opcode = N->getOpcode(); 8560 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8561 SDNode *N0 = N->getOperand(0).getNode(); 8562 SDNode *N1 = N->getOperand(1).getNode(); 8563 return N0->hasOneUse() && N1->hasOneUse() && 8564 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 8565 } 8566 return false; 8567 } 8568 8569 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 8570 // Multiplications are only custom-lowered for 128-bit vectors so that 8571 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
8572 EVT VT = Op.getValueType(); 8573 assert(VT.is128BitVector() && VT.isInteger() && 8574 "unexpected type for custom-lowering ISD::MUL"); 8575 SDNode *N0 = Op.getOperand(0).getNode(); 8576 SDNode *N1 = Op.getOperand(1).getNode(); 8577 unsigned NewOpc = 0; 8578 bool isMLA = false; 8579 bool isN0SExt = isSignExtended(N0, DAG); 8580 bool isN1SExt = isSignExtended(N1, DAG); 8581 if (isN0SExt && isN1SExt) 8582 NewOpc = ARMISD::VMULLs; 8583 else { 8584 bool isN0ZExt = isZeroExtended(N0, DAG); 8585 bool isN1ZExt = isZeroExtended(N1, DAG); 8586 if (isN0ZExt && isN1ZExt) 8587 NewOpc = ARMISD::VMULLu; 8588 else if (isN1SExt || isN1ZExt) { 8589 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 8590 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 8591 if (isN1SExt && isAddSubSExt(N0, DAG)) { 8592 NewOpc = ARMISD::VMULLs; 8593 isMLA = true; 8594 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 8595 NewOpc = ARMISD::VMULLu; 8596 isMLA = true; 8597 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 8598 std::swap(N0, N1); 8599 NewOpc = ARMISD::VMULLu; 8600 isMLA = true; 8601 } 8602 } 8603 8604 if (!NewOpc) { 8605 if (VT == MVT::v2i64) 8606 // Fall through to expand this. It is not legal. 8607 return SDValue(); 8608 else 8609 // Other vector multiplications are legal. 8610 return Op; 8611 } 8612 } 8613 8614 // Legalize to a VMULL instruction. 8615 SDLoc DL(Op); 8616 SDValue Op0; 8617 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 8618 if (!isMLA) { 8619 Op0 = SkipExtensionForVMULL(N0, DAG); 8620 assert(Op0.getValueType().is64BitVector() && 8621 Op1.getValueType().is64BitVector() && 8622 "unexpected types for extended operands to VMULL"); 8623 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 8624 } 8625 8626 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 8627 // isel lowering to take advantage of no-stall back to back vmul + vmla. 
8628 // vmull q0, d4, d6 8629 // vmlal q0, d5, d6 8630 // is faster than 8631 // vaddl q0, d4, d5 8632 // vmovl q1, d6 8633 // vmul q0, q0, q1 8634 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 8635 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 8636 EVT Op1VT = Op1.getValueType(); 8637 return DAG.getNode(N0->getOpcode(), DL, VT, 8638 DAG.getNode(NewOpc, DL, VT, 8639 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 8640 DAG.getNode(NewOpc, DL, VT, 8641 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 8642 } 8643 8644 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 8645 SelectionDAG &DAG) { 8646 // TODO: Should this propagate fast-math-flags? 8647 8648 // Convert to float 8649 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 8650 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 8651 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 8652 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 8653 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 8654 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 8655 // Get reciprocal estimate. 8656 // float4 recip = vrecpeq_f32(yf); 8657 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8658 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8659 Y); 8660 // Because char has a smaller range than uchar, we can actually get away 8661 // without any newton steps. This requires that we use a weird bias 8662 // of 0xb000, however (again, this has been exhaustively tested). 8663 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 8664 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 8665 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 8666 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 8667 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 8668 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 8669 // Convert back to short. 
8670 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 8671 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 8672 return X; 8673 } 8674 8675 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 8676 SelectionDAG &DAG) { 8677 // TODO: Should this propagate fast-math-flags? 8678 8679 SDValue N2; 8680 // Convert to float. 8681 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 8682 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 8683 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 8684 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 8685 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8686 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8687 8688 // Use reciprocal estimate and one refinement step. 8689 // float4 recip = vrecpeq_f32(yf); 8690 // recip *= vrecpsq_f32(yf, recip); 8691 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8692 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8693 N1); 8694 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8695 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8696 N1, N2); 8697 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8698 // Because short has a smaller range than ushort, we can actually get away 8699 // with only a single newton step. This requires that we use a weird bias 8700 // of 89, however (again, this has been exhaustively tested). 8701 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 8702 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8703 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8704 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 8705 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8706 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8707 // Convert back to integer and return. 
8708 // return vmovn_s32(vcvt_s32_f32(result)); 8709 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8710 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8711 return N0; 8712 } 8713 8714 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 8715 const ARMSubtarget *ST) { 8716 EVT VT = Op.getValueType(); 8717 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8718 "unexpected type for custom-lowering ISD::SDIV"); 8719 8720 SDLoc dl(Op); 8721 SDValue N0 = Op.getOperand(0); 8722 SDValue N1 = Op.getOperand(1); 8723 SDValue N2, N3; 8724 8725 if (VT == MVT::v8i8) { 8726 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 8727 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 8728 8729 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8730 DAG.getIntPtrConstant(4, dl)); 8731 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8732 DAG.getIntPtrConstant(4, dl)); 8733 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8734 DAG.getIntPtrConstant(0, dl)); 8735 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8736 DAG.getIntPtrConstant(0, dl)); 8737 8738 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 8739 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 8740 8741 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8742 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8743 8744 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 8745 return N0; 8746 } 8747 return LowerSDIV_v4i16(N0, N1, dl, DAG); 8748 } 8749 8750 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 8751 const ARMSubtarget *ST) { 8752 // TODO: Should this propagate fast-math-flags? 
8753 EVT VT = Op.getValueType(); 8754 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8755 "unexpected type for custom-lowering ISD::UDIV"); 8756 8757 SDLoc dl(Op); 8758 SDValue N0 = Op.getOperand(0); 8759 SDValue N1 = Op.getOperand(1); 8760 SDValue N2, N3; 8761 8762 if (VT == MVT::v8i8) { 8763 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 8764 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 8765 8766 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8767 DAG.getIntPtrConstant(4, dl)); 8768 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8769 DAG.getIntPtrConstant(4, dl)); 8770 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8771 DAG.getIntPtrConstant(0, dl)); 8772 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8773 DAG.getIntPtrConstant(0, dl)); 8774 8775 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 8776 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 8777 8778 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8779 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8780 8781 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 8782 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 8783 MVT::i32), 8784 N0); 8785 return N0; 8786 } 8787 8788 // v4i16 sdiv ... Convert to float. 8789 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 8790 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 8791 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 8792 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 8793 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8794 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8795 8796 // Use reciprocal estimate and two refinement steps. 
8797 // float4 recip = vrecpeq_f32(yf); 8798 // recip *= vrecpsq_f32(yf, recip); 8799 // recip *= vrecpsq_f32(yf, recip); 8800 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8801 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8802 BN1); 8803 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8804 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8805 BN1, N2); 8806 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8807 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8808 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8809 BN1, N2); 8810 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8811 // Simply multiplying by the reciprocal estimate can leave us a few ulps 8812 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 8813 // and that it will never cause us to return an answer too large). 8814 // float4 result = as_float4(as_int4(xf*recip) + 2); 8815 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8816 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8817 N1 = DAG.getConstant(2, dl, MVT::v4i32); 8818 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8819 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8820 // Convert back to integer and return. 8821 // return vmovn_u32(vcvt_s32_f32(result)); 8822 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8823 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8824 return N0; 8825 } 8826 8827 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 8828 SDNode *N = Op.getNode(); 8829 EVT VT = N->getValueType(0); 8830 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 8831 8832 SDValue Carry = Op.getOperand(2); 8833 8834 SDLoc DL(Op); 8835 8836 SDValue Result; 8837 if (Op.getOpcode() == ISD::ADDCARRY) { 8838 // This converts the boolean value carry into the carry flag. 8839 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8840 8841 // Do the addition proper using the carry flag we wanted. 
8842 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 8843 Op.getOperand(1), Carry); 8844 8845 // Now convert the carry flag into a boolean value. 8846 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8847 } else { 8848 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 8849 // have to invert the carry first. 8850 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8851 DAG.getConstant(1, DL, MVT::i32), Carry); 8852 // This converts the boolean value carry into the carry flag. 8853 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8854 8855 // Do the subtraction proper using the carry flag we wanted. 8856 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 8857 Op.getOperand(1), Carry); 8858 8859 // Now convert the carry flag into a boolean value. 8860 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8861 // But the carry returned by ARMISD::SUBE is not a borrow as expected 8862 // by ISD::SUBCARRY, so compute 1 - C. 8863 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8864 DAG.getConstant(1, DL, MVT::i32), Carry); 8865 } 8866 8867 // Return both values. 8868 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 8869 } 8870 8871 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 8872 assert(Subtarget->isTargetDarwin()); 8873 8874 // For iOS, we want to call an alternative entry point: __sincos_stret, 8875 // return values are passed via sret. 8876 SDLoc dl(Op); 8877 SDValue Arg = Op.getOperand(0); 8878 EVT ArgVT = Arg.getValueType(); 8879 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8880 auto PtrVT = getPointerTy(DAG.getDataLayout()); 8881 8882 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8883 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8884 8885 // Pair of floats / doubles used to pass the result. 
8886 Type *RetTy = StructType::get(ArgTy, ArgTy); 8887 auto &DL = DAG.getDataLayout(); 8888 8889 ArgListTy Args; 8890 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 8891 SDValue SRet; 8892 if (ShouldUseSRet) { 8893 // Create stack object for sret. 8894 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 8895 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 8896 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 8897 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 8898 8899 ArgListEntry Entry; 8900 Entry.Node = SRet; 8901 Entry.Ty = RetTy->getPointerTo(); 8902 Entry.IsSExt = false; 8903 Entry.IsZExt = false; 8904 Entry.IsSRet = true; 8905 Args.push_back(Entry); 8906 RetTy = Type::getVoidTy(*DAG.getContext()); 8907 } 8908 8909 ArgListEntry Entry; 8910 Entry.Node = Arg; 8911 Entry.Ty = ArgTy; 8912 Entry.IsSExt = false; 8913 Entry.IsZExt = false; 8914 Args.push_back(Entry); 8915 8916 RTLIB::Libcall LC = 8917 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 8918 const char *LibcallName = getLibcallName(LC); 8919 CallingConv::ID CC = getLibcallCallingConv(LC); 8920 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 8921 8922 TargetLowering::CallLoweringInfo CLI(DAG); 8923 CLI.setDebugLoc(dl) 8924 .setChain(DAG.getEntryNode()) 8925 .setCallee(CC, RetTy, Callee, std::move(Args)) 8926 .setDiscardResult(ShouldUseSRet); 8927 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 8928 8929 if (!ShouldUseSRet) 8930 return CallResult.first; 8931 8932 SDValue LoadSin = 8933 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 8934 8935 // Address of cos field. 
8936 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 8937 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 8938 SDValue LoadCos = 8939 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 8940 8941 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 8942 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 8943 LoadSin.getValue(0), LoadCos.getValue(0)); 8944 } 8945 8946 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 8947 bool Signed, 8948 SDValue &Chain) const { 8949 EVT VT = Op.getValueType(); 8950 assert((VT == MVT::i32 || VT == MVT::i64) && 8951 "unexpected type for custom lowering DIV"); 8952 SDLoc dl(Op); 8953 8954 const auto &DL = DAG.getDataLayout(); 8955 const auto &TLI = DAG.getTargetLoweringInfo(); 8956 8957 const char *Name = nullptr; 8958 if (Signed) 8959 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 8960 else 8961 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 8962 8963 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 8964 8965 ARMTargetLowering::ArgListTy Args; 8966 8967 for (auto AI : {1, 0}) { 8968 ArgListEntry Arg; 8969 Arg.Node = Op.getOperand(AI); 8970 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 8971 Args.push_back(Arg); 8972 } 8973 8974 CallLoweringInfo CLI(DAG); 8975 CLI.setDebugLoc(dl) 8976 .setChain(Chain) 8977 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 8978 ES, std::move(Args)); 8979 8980 return LowerCallTo(CLI).first; 8981 } 8982 8983 // This is a code size optimisation: return the original SDIV node to 8984 // DAGCombiner when we don't want to expand SDIV into a sequence of 8985 // instructions, and an empty node otherwise which will cause the 8986 // SDIV to be expanded in DAGCombine. 
8987 SDValue 8988 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 8989 SelectionDAG &DAG, 8990 SmallVectorImpl<SDNode *> &Created) const { 8991 // TODO: Support SREM 8992 if (N->getOpcode() != ISD::SDIV) 8993 return SDValue(); 8994 8995 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 8996 const bool MinSize = ST.hasMinSize(); 8997 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 8998 : ST.hasDivideInARMMode(); 8999 9000 // Don't touch vector types; rewriting this may lead to scalarizing 9001 // the int divs. 9002 if (N->getOperand(0).getValueType().isVector()) 9003 return SDValue(); 9004 9005 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9006 // hwdiv support for this to be really profitable. 9007 if (!(MinSize && HasDivide)) 9008 return SDValue(); 9009 9010 // ARM mode is a bit simpler than Thumb: we can handle large power 9011 // of 2 immediates with 1 mov instruction; no further checks required, 9012 // just return the sdiv node. 9013 if (!ST.isThumb()) 9014 return SDValue(N, 0); 9015 9016 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 9017 // and thus lose the code size benefits of a MOVS that requires only 2. 9018 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 9019 // but as it's doing exactly this, it's not worth the trouble to get TTI. 
9020 if (Divisor.sgt(128)) 9021 return SDValue(); 9022 9023 return SDValue(N, 0); 9024 } 9025 9026 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 9027 bool Signed) const { 9028 assert(Op.getValueType() == MVT::i32 && 9029 "unexpected type for custom lowering DIV"); 9030 SDLoc dl(Op); 9031 9032 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 9033 DAG.getEntryNode(), Op.getOperand(1)); 9034 9035 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9036 } 9037 9038 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 9039 SDLoc DL(N); 9040 SDValue Op = N->getOperand(1); 9041 if (N->getValueType(0) == MVT::i32) 9042 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 9043 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9044 DAG.getConstant(0, DL, MVT::i32)); 9045 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9046 DAG.getConstant(1, DL, MVT::i32)); 9047 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 9048 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 9049 } 9050 9051 void ARMTargetLowering::ExpandDIV_Windows( 9052 SDValue Op, SelectionDAG &DAG, bool Signed, 9053 SmallVectorImpl<SDValue> &Results) const { 9054 const auto &DL = DAG.getDataLayout(); 9055 const auto &TLI = DAG.getTargetLoweringInfo(); 9056 9057 assert(Op.getValueType() == MVT::i64 && 9058 "unexpected type for custom lowering DIV"); 9059 SDLoc dl(Op); 9060 9061 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 9062 9063 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9064 9065 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 9066 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 9067 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 9068 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 9069 9070 Results.push_back(Lower); 9071 Results.push_back(Upper); 9072 } 9073 9074 static 
SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { 9075 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); 9076 EVT MemVT = LD->getMemoryVT(); 9077 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9078 "Expected a predicate type!"); 9079 assert(MemVT == Op.getValueType()); 9080 assert(LD->getExtensionType() == ISD::NON_EXTLOAD && 9081 "Expected a non-extending load"); 9082 assert(LD->isUnindexed() && "Expected a unindexed load"); 9083 9084 // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit 9085 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We 9086 // need to make sure that 8/4 bits are actually loaded into the correct 9087 // place, which means loading the value and then shuffling the values into 9088 // the bottom bits of the predicate. 9089 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect 9090 // for BE). 9091 9092 SDLoc dl(Op); 9093 SDValue Load = DAG.getExtLoad( 9094 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), 9095 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9096 LD->getMemOperand()); 9097 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); 9098 if (MemVT != MVT::v16i1) 9099 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, 9100 DAG.getConstant(0, dl, MVT::i32)); 9101 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); 9102 } 9103 9104 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { 9105 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9106 EVT MemVT = ST->getMemoryVT(); 9107 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9108 "Expected a predicate type!"); 9109 assert(MemVT == ST->getValue().getValueType()); 9110 assert(!ST->isTruncatingStore() && "Expected a non-extending store"); 9111 assert(ST->isUnindexed() && "Expected a unindexed store"); 9112 9113 // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits 9114 
// unset and a scalar store. 9115 SDLoc dl(Op); 9116 SDValue Build = ST->getValue(); 9117 if (MemVT != MVT::v16i1) { 9118 SmallVector<SDValue, 16> Ops; 9119 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) 9120 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 9121 DAG.getConstant(I, dl, MVT::i32))); 9122 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 9123 Ops.push_back(DAG.getUNDEF(MVT::i32)); 9124 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 9125 } 9126 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 9127 return DAG.getTruncStore( 9128 ST->getChain(), dl, GRP, ST->getBasePtr(), 9129 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9130 ST->getMemOperand()); 9131 } 9132 9133 static bool isZeroVector(SDValue N) { 9134 return (ISD::isBuildVectorAllZeros(N.getNode()) || 9135 (N->getOpcode() == ARMISD::VMOVIMM && 9136 isNullConstant(N->getOperand(0)))); 9137 } 9138 9139 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 9140 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 9141 MVT VT = Op.getSimpleValueType(); 9142 SDValue Mask = N->getMask(); 9143 SDValue PassThru = N->getPassThru(); 9144 SDLoc dl(Op); 9145 9146 if (isZeroVector(PassThru)) 9147 return Op; 9148 9149 // MVE Masked loads use zero as the passthru value. Here we convert undef to 9150 // zero too, and other values are lowered to a select. 
9151 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 9152 DAG.getTargetConstant(0, dl, MVT::i32)); 9153 SDValue NewLoad = DAG.getMaskedLoad( 9154 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, 9155 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), 9156 N->getExtensionType(), N->isExpandingLoad()); 9157 SDValue Combo = NewLoad; 9158 if (!PassThru.isUndef() && 9159 (PassThru.getOpcode() != ISD::BITCAST || 9160 !isZeroVector(PassThru->getOperand(0)))) 9161 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 9162 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 9163 } 9164 9165 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 9166 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 9167 // Acquire/Release load/store is not legal for targets without a dmb or 9168 // equivalent available. 9169 return SDValue(); 9170 9171 // Monotonic load/store is legal for all targets. 9172 return Op; 9173 } 9174 9175 static void ReplaceREADCYCLECOUNTER(SDNode *N, 9176 SmallVectorImpl<SDValue> &Results, 9177 SelectionDAG &DAG, 9178 const ARMSubtarget *Subtarget) { 9179 SDLoc DL(N); 9180 // Under Power Management extensions, the cycle-count is: 9181 // mrc p15, #0, <Rt>, c9, c13, #0 9182 SDValue Ops[] = { N->getOperand(0), // Chain 9183 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 9184 DAG.getTargetConstant(15, DL, MVT::i32), 9185 DAG.getTargetConstant(0, DL, MVT::i32), 9186 DAG.getTargetConstant(9, DL, MVT::i32), 9187 DAG.getTargetConstant(13, DL, MVT::i32), 9188 DAG.getTargetConstant(0, DL, MVT::i32) 9189 }; 9190 9191 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 9192 DAG.getVTList(MVT::i32, MVT::Other), Ops); 9193 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 9194 DAG.getConstant(0, DL, MVT::i32))); 9195 Results.push_back(Cycles32.getValue(1)); 9196 } 9197 9198 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 9199 
SDLoc dl(V.getNode()); 9200 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 9201 SDValue VHi = DAG.getAnyExtOrTrunc( 9202 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 9203 dl, MVT::i32); 9204 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9205 if (isBigEndian) 9206 std::swap (VLo, VHi); 9207 SDValue RegClass = 9208 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 9209 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 9210 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 9211 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 9212 return SDValue( 9213 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 9214 } 9215 9216 static void ReplaceCMP_SWAP_64Results(SDNode *N, 9217 SmallVectorImpl<SDValue> & Results, 9218 SelectionDAG &DAG) { 9219 assert(N->getValueType(0) == MVT::i64 && 9220 "AtomicCmpSwap on types less than 64 should be legal"); 9221 SDValue Ops[] = {N->getOperand(1), 9222 createGPRPairNode(DAG, N->getOperand(2)), 9223 createGPRPairNode(DAG, N->getOperand(3)), 9224 N->getOperand(0)}; 9225 SDNode *CmpSwap = DAG.getMachineNode( 9226 ARM::CMP_SWAP_64, SDLoc(N), 9227 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 9228 9229 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 9230 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 9231 9232 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9233 9234 Results.push_back( 9235 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 9236 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9237 Results.push_back( 9238 DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, 9239 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9240 Results.push_back(SDValue(CmpSwap, 2)); 9241 } 9242 9243 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 9244 SDLoc dl(Op); 9245 EVT VT = Op.getValueType(); 9246 SDValue Chain = Op.getOperand(0); 9247 SDValue LHS = Op.getOperand(1); 9248 SDValue RHS = Op.getOperand(2); 9249 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 9250 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 9251 9252 // If we don't have instructions of this float type then soften to a libcall 9253 // and use SETCC instead. 9254 if (isUnsupportedFloatingType(LHS.getValueType())) { 9255 DAG.getTargetLoweringInfo().softenSetCCOperands( 9256 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 9257 if (!RHS.getNode()) { 9258 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 9259 CC = ISD::SETNE; 9260 } 9261 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 9262 DAG.getCondCode(CC)); 9263 return DAG.getMergeValues({Result, Chain}, dl); 9264 } 9265 9266 ARMCC::CondCodes CondCode, CondCode2; 9267 FPCCToARMCC(CC, CondCode, CondCode2); 9268 9269 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 9270 // in CMPFP and CMPFPE, but instead it should be made explicit by these 9271 // instructions using a chain instead of glue. This would also fix the problem 9272 // here (and also in LowerSELECT_CC) where we generate two comparisons when 9273 // CondCode2 != AL. 
9274 SDValue True = DAG.getConstant(1, dl, VT); 9275 SDValue False = DAG.getConstant(0, dl, VT); 9276 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 9277 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 9278 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9279 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 9280 if (CondCode2 != ARMCC::AL) { 9281 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 9282 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9283 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 9284 } 9285 return DAG.getMergeValues({Result, Chain}, dl); 9286 } 9287 9288 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9289 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 9290 switch (Op.getOpcode()) { 9291 default: llvm_unreachable("Don't know how to custom lower this!"); 9292 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 9293 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9294 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9295 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9296 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9297 case ISD::SELECT: return LowerSELECT(Op, DAG); 9298 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9299 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9300 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 9301 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 9302 case ISD::VASTART: return LowerVASTART(Op, DAG); 9303 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 9304 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 9305 case ISD::SINT_TO_FP: 9306 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9307 case ISD::STRICT_FP_TO_SINT: 9308 case ISD::STRICT_FP_TO_UINT: 9309 case ISD::FP_TO_SINT: 9310 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 9311 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9312 case ISD::RETURNADDR: 
return LowerRETURNADDR(Op, DAG); 9313 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9314 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 9315 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 9316 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 9317 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 9318 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 9319 Subtarget); 9320 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 9321 case ISD::SHL: 9322 case ISD::SRL: 9323 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 9324 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 9325 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 9326 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 9327 case ISD::SRL_PARTS: 9328 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 9329 case ISD::CTTZ: 9330 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 9331 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 9332 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 9333 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 9334 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 9335 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 9336 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 9337 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 9338 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9339 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 9340 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 9341 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9342 case ISD::MUL: return LowerMUL(Op, DAG); 9343 case ISD::SDIV: 9344 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9345 return 
LowerDIV_Windows(Op, DAG, /* Signed */ true); 9346 return LowerSDIV(Op, DAG, Subtarget); 9347 case ISD::UDIV: 9348 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9349 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 9350 return LowerUDIV(Op, DAG, Subtarget); 9351 case ISD::ADDCARRY: 9352 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 9353 case ISD::SADDO: 9354 case ISD::SSUBO: 9355 return LowerSignedALUO(Op, DAG); 9356 case ISD::UADDO: 9357 case ISD::USUBO: 9358 return LowerUnsignedALUO(Op, DAG); 9359 case ISD::SADDSAT: 9360 case ISD::SSUBSAT: 9361 return LowerSADDSUBSAT(Op, DAG, Subtarget); 9362 case ISD::LOAD: 9363 return LowerPredicateLoad(Op, DAG); 9364 case ISD::STORE: 9365 return LowerPredicateStore(Op, DAG); 9366 case ISD::MLOAD: 9367 return LowerMLOAD(Op, DAG); 9368 case ISD::ATOMIC_LOAD: 9369 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 9370 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 9371 case ISD::SDIVREM: 9372 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 9373 case ISD::DYNAMIC_STACKALLOC: 9374 if (Subtarget->isTargetWindows()) 9375 return LowerDYNAMIC_STACKALLOC(Op, DAG); 9376 llvm_unreachable("Don't know how to custom lower this!"); 9377 case ISD::STRICT_FP_ROUND: 9378 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 9379 case ISD::STRICT_FP_EXTEND: 9380 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 9381 case ISD::STRICT_FSETCC: 9382 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 9383 case ARMISD::WIN__DBZCHK: return SDValue(); 9384 } 9385 } 9386 9387 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 9388 SelectionDAG &DAG) { 9389 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9390 unsigned Opc = 0; 9391 if (IntNo == Intrinsic::arm_smlald) 9392 Opc = ARMISD::SMLALD; 9393 else if (IntNo == Intrinsic::arm_smlaldx) 9394 Opc = ARMISD::SMLALDX; 9395 else if (IntNo == Intrinsic::arm_smlsld) 9396 Opc = ARMISD::SMLSLD; 9397 else if 
(IntNo == Intrinsic::arm_smlsldx) 9398 Opc = ARMISD::SMLSLDX; 9399 else 9400 return; 9401 9402 SDLoc dl(N); 9403 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9404 N->getOperand(3), 9405 DAG.getConstant(0, dl, MVT::i32)); 9406 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9407 N->getOperand(3), 9408 DAG.getConstant(1, dl, MVT::i32)); 9409 9410 SDValue LongMul = DAG.getNode(Opc, dl, 9411 DAG.getVTList(MVT::i32, MVT::i32), 9412 N->getOperand(1), N->getOperand(2), 9413 Lo, Hi); 9414 Results.push_back(LongMul.getValue(0)); 9415 Results.push_back(LongMul.getValue(1)); 9416 } 9417 9418 /// ReplaceNodeResults - Replace the results of node with an illegal result 9419 /// type with new values built out of custom code. 9420 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 9421 SmallVectorImpl<SDValue> &Results, 9422 SelectionDAG &DAG) const { 9423 SDValue Res; 9424 switch (N->getOpcode()) { 9425 default: 9426 llvm_unreachable("Don't know how to custom expand this!"); 9427 case ISD::READ_REGISTER: 9428 ExpandREAD_REGISTER(N, Results, DAG); 9429 break; 9430 case ISD::BITCAST: 9431 Res = ExpandBITCAST(N, DAG, Subtarget); 9432 break; 9433 case ISD::SRL: 9434 case ISD::SRA: 9435 case ISD::SHL: 9436 Res = Expand64BitShift(N, DAG, Subtarget); 9437 break; 9438 case ISD::SREM: 9439 case ISD::UREM: 9440 Res = LowerREM(N, DAG); 9441 break; 9442 case ISD::SDIVREM: 9443 case ISD::UDIVREM: 9444 Res = LowerDivRem(SDValue(N, 0), DAG); 9445 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 9446 Results.push_back(Res.getValue(0)); 9447 Results.push_back(Res.getValue(1)); 9448 return; 9449 case ISD::SADDSAT: 9450 case ISD::SSUBSAT: 9451 Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 9452 break; 9453 case ISD::READCYCLECOUNTER: 9454 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 9455 return; 9456 case ISD::UDIV: 9457 case ISD::SDIV: 9458 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 9459 return 
ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 9460 Results); 9461 case ISD::ATOMIC_CMP_SWAP: 9462 ReplaceCMP_SWAP_64Results(N, Results, DAG); 9463 return; 9464 case ISD::INTRINSIC_WO_CHAIN: 9465 return ReplaceLongIntrinsic(N, Results, DAG); 9466 case ISD::ABS: 9467 lowerABS(N, Results, DAG); 9468 return ; 9469 9470 } 9471 if (Res.getNode()) 9472 Results.push_back(Res); 9473 } 9474 9475 //===----------------------------------------------------------------------===// 9476 // ARM Scheduler Hooks 9477 //===----------------------------------------------------------------------===// 9478 9479 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 9480 /// registers the function context. 9481 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 9482 MachineBasicBlock *MBB, 9483 MachineBasicBlock *DispatchBB, 9484 int FI) const { 9485 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 9486 "ROPI/RWPI not currently supported with SjLj"); 9487 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9488 DebugLoc dl = MI.getDebugLoc(); 9489 MachineFunction *MF = MBB->getParent(); 9490 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9491 MachineConstantPool *MCP = MF->getConstantPool(); 9492 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 9493 const Function &F = MF->getFunction(); 9494 9495 bool isThumb = Subtarget->isThumb(); 9496 bool isThumb2 = Subtarget->isThumb2(); 9497 9498 unsigned PCLabelId = AFI->createPICLabelUId(); 9499 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 9500 ARMConstantPoolValue *CPV = 9501 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 9502 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 9503 9504 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 9505 : &ARM::GPRRegClass; 9506 9507 // Grab constant pool and fixed stack memory operands. 
9508 MachineMemOperand *CPMMO = 9509 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 9510 MachineMemOperand::MOLoad, 4, 4); 9511 9512 MachineMemOperand *FIMMOSt = 9513 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 9514 MachineMemOperand::MOStore, 4, 4); 9515 9516 // Load the address of the dispatch MBB into the jump buffer. 9517 if (isThumb2) { 9518 // Incoming value: jbuf 9519 // ldr.n r5, LCPI1_1 9520 // orr r5, r5, #1 9521 // add r5, pc 9522 // str r5, [$jbuf, #+4] ; &jbuf[1] 9523 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9524 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 9525 .addConstantPoolIndex(CPI) 9526 .addMemOperand(CPMMO) 9527 .add(predOps(ARMCC::AL)); 9528 // Set the low bit because of thumb mode. 9529 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9530 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 9531 .addReg(NewVReg1, RegState::Kill) 9532 .addImm(0x01) 9533 .add(predOps(ARMCC::AL)) 9534 .add(condCodeOp()); 9535 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9536 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 9537 .addReg(NewVReg2, RegState::Kill) 9538 .addImm(PCLabelId); 9539 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 9540 .addReg(NewVReg3, RegState::Kill) 9541 .addFrameIndex(FI) 9542 .addImm(36) // &jbuf[1] :: pc 9543 .addMemOperand(FIMMOSt) 9544 .add(predOps(ARMCC::AL)); 9545 } else if (isThumb) { 9546 // Incoming value: jbuf 9547 // ldr.n r1, LCPI1_4 9548 // add r1, pc 9549 // mov r2, #1 9550 // orrs r1, r2 9551 // add r2, $jbuf, #+4 ; &jbuf[1] 9552 // str r1, [r2] 9553 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9554 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 9555 .addConstantPoolIndex(CPI) 9556 .addMemOperand(CPMMO) 9557 .add(predOps(ARMCC::AL)); 9558 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9559 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 9560 .addReg(NewVReg1, RegState::Kill) 9561 .addImm(PCLabelId); 
9562 // Set the low bit because of thumb mode. 9563 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9564 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 9565 .addReg(ARM::CPSR, RegState::Define) 9566 .addImm(1) 9567 .add(predOps(ARMCC::AL)); 9568 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9569 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 9570 .addReg(ARM::CPSR, RegState::Define) 9571 .addReg(NewVReg2, RegState::Kill) 9572 .addReg(NewVReg3, RegState::Kill) 9573 .add(predOps(ARMCC::AL)); 9574 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9575 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 9576 .addFrameIndex(FI) 9577 .addImm(36); // &jbuf[1] :: pc 9578 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 9579 .addReg(NewVReg4, RegState::Kill) 9580 .addReg(NewVReg5, RegState::Kill) 9581 .addImm(0) 9582 .addMemOperand(FIMMOSt) 9583 .add(predOps(ARMCC::AL)); 9584 } else { 9585 // Incoming value: jbuf 9586 // ldr r1, LCPI1_1 9587 // add r1, pc, r1 9588 // str r1, [$jbuf, #+4] ; &jbuf[1] 9589 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9590 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 9591 .addConstantPoolIndex(CPI) 9592 .addImm(0) 9593 .addMemOperand(CPMMO) 9594 .add(predOps(ARMCC::AL)); 9595 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9596 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 9597 .addReg(NewVReg1, RegState::Kill) 9598 .addImm(PCLabelId) 9599 .add(predOps(ARMCC::AL)); 9600 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 9601 .addReg(NewVReg2, RegState::Kill) 9602 .addFrameIndex(FI) 9603 .addImm(36) // &jbuf[1] :: pc 9604 .addMemOperand(FIMMOSt) 9605 .add(predOps(ARMCC::AL)); 9606 } 9607 } 9608 9609 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 9610 MachineBasicBlock *MBB) const { 9611 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9612 DebugLoc dl = MI.getDebugLoc(); 9613 MachineFunction *MF = MBB->getParent(); 9614 MachineRegisterInfo *MRI = &MF->getRegInfo(); 
9615 MachineFrameInfo &MFI = MF->getFrameInfo(); 9616 int FI = MFI.getFunctionContextIndex(); 9617 9618 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 9619 : &ARM::GPRnopcRegClass; 9620 9621 // Get a mapping of the call site numbers to all of the landing pads they're 9622 // associated with. 9623 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 9624 unsigned MaxCSNum = 0; 9625 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 9626 ++BB) { 9627 if (!BB->isEHPad()) continue; 9628 9629 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 9630 // pad. 9631 for (MachineBasicBlock::iterator 9632 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 9633 if (!II->isEHLabel()) continue; 9634 9635 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 9636 if (!MF->hasCallSiteLandingPad(Sym)) continue; 9637 9638 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 9639 for (SmallVectorImpl<unsigned>::iterator 9640 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 9641 CSI != CSE; ++CSI) { 9642 CallSiteNumToLPad[*CSI].push_back(&*BB); 9643 MaxCSNum = std::max(MaxCSNum, *CSI); 9644 } 9645 break; 9646 } 9647 } 9648 9649 // Get an ordered list of the machine basic blocks for the jump table. 9650 std::vector<MachineBasicBlock*> LPadList; 9651 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 9652 LPadList.reserve(CallSiteNumToLPad.size()); 9653 for (unsigned I = 1; I <= MaxCSNum; ++I) { 9654 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 9655 for (SmallVectorImpl<MachineBasicBlock*>::iterator 9656 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 9657 LPadList.push_back(*II); 9658 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 9659 } 9660 } 9661 9662 assert(!LPadList.empty() && 9663 "No landing pad destinations for the dispatch jump table!"); 9664 9665 // Create the jump table and associated information. 
9666 MachineJumpTableInfo *JTI = 9667 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 9668 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 9669 9670 // Create the MBBs for the dispatch code. 9671 9672 // Shove the dispatch's address into the return slot in the function context. 9673 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 9674 DispatchBB->setIsEHPad(); 9675 9676 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9677 unsigned trap_opcode; 9678 if (Subtarget->isThumb()) 9679 trap_opcode = ARM::tTRAP; 9680 else 9681 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 9682 9683 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 9684 DispatchBB->addSuccessor(TrapBB); 9685 9686 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 9687 DispatchBB->addSuccessor(DispContBB); 9688 9689 // Insert and MBBs. 9690 MF->insert(MF->end(), DispatchBB); 9691 MF->insert(MF->end(), DispContBB); 9692 MF->insert(MF->end(), TrapBB); 9693 9694 // Insert code into the entry block that creates and registers the function 9695 // context. 9696 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 9697 9698 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 9699 MachinePointerInfo::getFixedStack(*MF, FI), 9700 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 9701 9702 MachineInstrBuilder MIB; 9703 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 9704 9705 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 9706 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 9707 9708 // Add a register mask with no preserved registers. This results in all 9709 // registers being marked as clobbered. This can't work if the dispatch block 9710 // is in a Thumb1 function and is linked with ARM code which uses the FP 9711 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 
9712 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 9713 9714 bool IsPositionIndependent = isPositionIndependent(); 9715 unsigned NumLPads = LPadList.size(); 9716 if (Subtarget->isThumb2()) { 9717 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9718 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 9719 .addFrameIndex(FI) 9720 .addImm(4) 9721 .addMemOperand(FIMMOLd) 9722 .add(predOps(ARMCC::AL)); 9723 9724 if (NumLPads < 256) { 9725 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 9726 .addReg(NewVReg1) 9727 .addImm(LPadList.size()) 9728 .add(predOps(ARMCC::AL)); 9729 } else { 9730 Register VReg1 = MRI->createVirtualRegister(TRC); 9731 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 9732 .addImm(NumLPads & 0xFFFF) 9733 .add(predOps(ARMCC::AL)); 9734 9735 unsigned VReg2 = VReg1; 9736 if ((NumLPads & 0xFFFF0000) != 0) { 9737 VReg2 = MRI->createVirtualRegister(TRC); 9738 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 9739 .addReg(VReg1) 9740 .addImm(NumLPads >> 16) 9741 .add(predOps(ARMCC::AL)); 9742 } 9743 9744 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 9745 .addReg(NewVReg1) 9746 .addReg(VReg2) 9747 .add(predOps(ARMCC::AL)); 9748 } 9749 9750 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 9751 .addMBB(TrapBB) 9752 .addImm(ARMCC::HI) 9753 .addReg(ARM::CPSR); 9754 9755 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9756 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 9757 .addJumpTableIndex(MJTI) 9758 .add(predOps(ARMCC::AL)); 9759 9760 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9761 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 9762 .addReg(NewVReg3, RegState::Kill) 9763 .addReg(NewVReg1) 9764 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9765 .add(predOps(ARMCC::AL)) 9766 .add(condCodeOp()); 9767 9768 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 9769 .addReg(NewVReg4, RegState::Kill) 9770 .addReg(NewVReg1) 9771 .addJumpTableIndex(MJTI); 9772 } else if (Subtarget->isThumb()) { 9773 
Register NewVReg1 = MRI->createVirtualRegister(TRC); 9774 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 9775 .addFrameIndex(FI) 9776 .addImm(1) 9777 .addMemOperand(FIMMOLd) 9778 .add(predOps(ARMCC::AL)); 9779 9780 if (NumLPads < 256) { 9781 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 9782 .addReg(NewVReg1) 9783 .addImm(NumLPads) 9784 .add(predOps(ARMCC::AL)); 9785 } else { 9786 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9787 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9788 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9789 9790 // MachineConstantPool wants an explicit alignment. 9791 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9792 if (Align == 0) 9793 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9794 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9795 9796 Register VReg1 = MRI->createVirtualRegister(TRC); 9797 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 9798 .addReg(VReg1, RegState::Define) 9799 .addConstantPoolIndex(Idx) 9800 .add(predOps(ARMCC::AL)); 9801 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 9802 .addReg(NewVReg1) 9803 .addReg(VReg1) 9804 .add(predOps(ARMCC::AL)); 9805 } 9806 9807 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 9808 .addMBB(TrapBB) 9809 .addImm(ARMCC::HI) 9810 .addReg(ARM::CPSR); 9811 9812 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9813 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 9814 .addReg(ARM::CPSR, RegState::Define) 9815 .addReg(NewVReg1) 9816 .addImm(2) 9817 .add(predOps(ARMCC::AL)); 9818 9819 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9820 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 9821 .addJumpTableIndex(MJTI) 9822 .add(predOps(ARMCC::AL)); 9823 9824 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9825 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 9826 .addReg(ARM::CPSR, RegState::Define) 9827 .addReg(NewVReg2, RegState::Kill) 9828 
.addReg(NewVReg3) 9829 .add(predOps(ARMCC::AL)); 9830 9831 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9832 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9833 9834 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9835 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 9836 .addReg(NewVReg4, RegState::Kill) 9837 .addImm(0) 9838 .addMemOperand(JTMMOLd) 9839 .add(predOps(ARMCC::AL)); 9840 9841 unsigned NewVReg6 = NewVReg5; 9842 if (IsPositionIndependent) { 9843 NewVReg6 = MRI->createVirtualRegister(TRC); 9844 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 9845 .addReg(ARM::CPSR, RegState::Define) 9846 .addReg(NewVReg5, RegState::Kill) 9847 .addReg(NewVReg3) 9848 .add(predOps(ARMCC::AL)); 9849 } 9850 9851 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 9852 .addReg(NewVReg6, RegState::Kill) 9853 .addJumpTableIndex(MJTI); 9854 } else { 9855 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9856 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 9857 .addFrameIndex(FI) 9858 .addImm(4) 9859 .addMemOperand(FIMMOLd) 9860 .add(predOps(ARMCC::AL)); 9861 9862 if (NumLPads < 256) { 9863 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 9864 .addReg(NewVReg1) 9865 .addImm(NumLPads) 9866 .add(predOps(ARMCC::AL)); 9867 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 9868 Register VReg1 = MRI->createVirtualRegister(TRC); 9869 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 9870 .addImm(NumLPads & 0xFFFF) 9871 .add(predOps(ARMCC::AL)); 9872 9873 unsigned VReg2 = VReg1; 9874 if ((NumLPads & 0xFFFF0000) != 0) { 9875 VReg2 = MRI->createVirtualRegister(TRC); 9876 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 9877 .addReg(VReg1) 9878 .addImm(NumLPads >> 16) 9879 .add(predOps(ARMCC::AL)); 9880 } 9881 9882 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9883 .addReg(NewVReg1) 9884 .addReg(VReg2) 9885 .add(predOps(ARMCC::AL)); 9886 } else { 9887 MachineConstantPool *ConstantPool = MF->getConstantPool(); 
9888 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9889 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9890 9891 // MachineConstantPool wants an explicit alignment. 9892 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9893 if (Align == 0) 9894 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9895 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9896 9897 Register VReg1 = MRI->createVirtualRegister(TRC); 9898 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 9899 .addReg(VReg1, RegState::Define) 9900 .addConstantPoolIndex(Idx) 9901 .addImm(0) 9902 .add(predOps(ARMCC::AL)); 9903 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9904 .addReg(NewVReg1) 9905 .addReg(VReg1, RegState::Kill) 9906 .add(predOps(ARMCC::AL)); 9907 } 9908 9909 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 9910 .addMBB(TrapBB) 9911 .addImm(ARMCC::HI) 9912 .addReg(ARM::CPSR); 9913 9914 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9915 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 9916 .addReg(NewVReg1) 9917 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9918 .add(predOps(ARMCC::AL)) 9919 .add(condCodeOp()); 9920 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9921 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 9922 .addJumpTableIndex(MJTI) 9923 .add(predOps(ARMCC::AL)); 9924 9925 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9926 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9927 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9928 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 9929 .addReg(NewVReg3, RegState::Kill) 9930 .addReg(NewVReg4) 9931 .addImm(0) 9932 .addMemOperand(JTMMOLd) 9933 .add(predOps(ARMCC::AL)); 9934 9935 if (IsPositionIndependent) { 9936 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 9937 .addReg(NewVReg5, RegState::Kill) 9938 .addReg(NewVReg4) 9939 .addJumpTableIndex(MJTI); 9940 } else { 9941 BuildMI(DispContBB, dl, 
TII->get(ARM::BR_JTr)) 9942 .addReg(NewVReg5, RegState::Kill) 9943 .addJumpTableIndex(MJTI); 9944 } 9945 } 9946 9947 // Add the jump table entries as successors to the MBB. 9948 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 9949 for (std::vector<MachineBasicBlock*>::iterator 9950 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 9951 MachineBasicBlock *CurMBB = *I; 9952 if (SeenMBBs.insert(CurMBB).second) 9953 DispContBB->addSuccessor(CurMBB); 9954 } 9955 9956 // N.B. the order the invoke BBs are processed in doesn't matter here. 9957 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 9958 SmallVector<MachineBasicBlock*, 64> MBBLPads; 9959 for (MachineBasicBlock *BB : InvokeBBs) { 9960 9961 // Remove the landing pad successor from the invoke block and replace it 9962 // with the new dispatch block. 9963 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 9964 BB->succ_end()); 9965 while (!Successors.empty()) { 9966 MachineBasicBlock *SMBB = Successors.pop_back_val(); 9967 if (SMBB->isEHPad()) { 9968 BB->removeSuccessor(SMBB); 9969 MBBLPads.push_back(SMBB); 9970 } 9971 } 9972 9973 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 9974 BB->normalizeSuccProbs(); 9975 9976 // Find the invoke call and mark all of the callee-saved registers as 9977 // 'implicit defined' so that they're spilled. This prevents code from 9978 // moving instructions to before the EH block, where they will never be 9979 // executed. 
9980 for (MachineBasicBlock::reverse_iterator 9981 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 9982 if (!II->isCall()) continue; 9983 9984 DenseMap<unsigned, bool> DefRegs; 9985 for (MachineInstr::mop_iterator 9986 OI = II->operands_begin(), OE = II->operands_end(); 9987 OI != OE; ++OI) { 9988 if (!OI->isReg()) continue; 9989 DefRegs[OI->getReg()] = true; 9990 } 9991 9992 MachineInstrBuilder MIB(*MF, &*II); 9993 9994 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 9995 unsigned Reg = SavedRegs[i]; 9996 if (Subtarget->isThumb2() && 9997 !ARM::tGPRRegClass.contains(Reg) && 9998 !ARM::hGPRRegClass.contains(Reg)) 9999 continue; 10000 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 10001 continue; 10002 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 10003 continue; 10004 if (!DefRegs[Reg]) 10005 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 10006 } 10007 10008 break; 10009 } 10010 } 10011 10012 // Mark all former landing pads as non-landing pads. The dispatch is the only 10013 // landing pad now. 10014 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10015 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 10016 (*I)->setIsEHPad(false); 10017 10018 // The instruction is gone now. 10019 MI.eraseFromParent(); 10020 } 10021 10022 static 10023 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 10024 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 10025 E = MBB->succ_end(); I != E; ++I) 10026 if (*I != Succ) 10027 return *I; 10028 llvm_unreachable("Expecting a BB with two successors!"); 10029 } 10030 10031 /// Return the load opcode for a given load size. If load size >= 8, 10032 /// neon opcode will be returned. 10033 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 10034 if (LdSize >= 8) 10035 return LdSize == 16 ? ARM::VLD1q32wb_fixed 10036 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 10037 if (IsThumb1) 10038 return LdSize == 4 ? 
ARM::tLDRi 10039 : LdSize == 2 ? ARM::tLDRHi 10040 : LdSize == 1 ? ARM::tLDRBi : 0; 10041 if (IsThumb2) 10042 return LdSize == 4 ? ARM::t2LDR_POST 10043 : LdSize == 2 ? ARM::t2LDRH_POST 10044 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 10045 return LdSize == 4 ? ARM::LDR_POST_IMM 10046 : LdSize == 2 ? ARM::LDRH_POST 10047 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 10048 } 10049 10050 /// Return the store opcode for a given store size. If store size >= 8, 10051 /// neon opcode will be returned. 10052 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 10053 if (StSize >= 8) 10054 return StSize == 16 ? ARM::VST1q32wb_fixed 10055 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 10056 if (IsThumb1) 10057 return StSize == 4 ? ARM::tSTRi 10058 : StSize == 2 ? ARM::tSTRHi 10059 : StSize == 1 ? ARM::tSTRBi : 0; 10060 if (IsThumb2) 10061 return StSize == 4 ? ARM::t2STR_POST 10062 : StSize == 2 ? ARM::t2STRH_POST 10063 : StSize == 1 ? ARM::t2STRB_POST : 0; 10064 return StSize == 4 ? ARM::STR_POST_IMM 10065 : StSize == 2 ? ARM::STRH_POST 10066 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 10067 } 10068 10069 /// Emit a post-increment load operation with given size. The instructions 10070 /// will be added to BB at Pos. 
10071 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10072 const TargetInstrInfo *TII, const DebugLoc &dl, 10073 unsigned LdSize, unsigned Data, unsigned AddrIn, 10074 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10075 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10076 assert(LdOpc != 0 && "Should have a load opcode"); 10077 if (LdSize >= 8) { 10078 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10079 .addReg(AddrOut, RegState::Define) 10080 .addReg(AddrIn) 10081 .addImm(0) 10082 .add(predOps(ARMCC::AL)); 10083 } else if (IsThumb1) { 10084 // load + update AddrIn 10085 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10086 .addReg(AddrIn) 10087 .addImm(0) 10088 .add(predOps(ARMCC::AL)); 10089 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10090 .add(t1CondCodeOp()) 10091 .addReg(AddrIn) 10092 .addImm(LdSize) 10093 .add(predOps(ARMCC::AL)); 10094 } else if (IsThumb2) { 10095 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10096 .addReg(AddrOut, RegState::Define) 10097 .addReg(AddrIn) 10098 .addImm(LdSize) 10099 .add(predOps(ARMCC::AL)); 10100 } else { // arm 10101 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10102 .addReg(AddrOut, RegState::Define) 10103 .addReg(AddrIn) 10104 .addReg(0) 10105 .addImm(LdSize) 10106 .add(predOps(ARMCC::AL)); 10107 } 10108 } 10109 10110 /// Emit a post-increment store operation with given size. The instructions 10111 /// will be added to BB at Pos. 
10112 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10113 const TargetInstrInfo *TII, const DebugLoc &dl, 10114 unsigned StSize, unsigned Data, unsigned AddrIn, 10115 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10116 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10117 assert(StOpc != 0 && "Should have a store opcode"); 10118 if (StSize >= 8) { 10119 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10120 .addReg(AddrIn) 10121 .addImm(0) 10122 .addReg(Data) 10123 .add(predOps(ARMCC::AL)); 10124 } else if (IsThumb1) { 10125 // store + update AddrIn 10126 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10127 .addReg(Data) 10128 .addReg(AddrIn) 10129 .addImm(0) 10130 .add(predOps(ARMCC::AL)); 10131 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10132 .add(t1CondCodeOp()) 10133 .addReg(AddrIn) 10134 .addImm(StSize) 10135 .add(predOps(ARMCC::AL)); 10136 } else if (IsThumb2) { 10137 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10138 .addReg(Data) 10139 .addReg(AddrIn) 10140 .addImm(StSize) 10141 .add(predOps(ARMCC::AL)); 10142 } else { // arm 10143 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10144 .addReg(Data) 10145 .addReg(AddrIn) 10146 .addReg(0) 10147 .addImm(StSize) 10148 .add(predOps(ARMCC::AL)); 10149 } 10150 } 10151 10152 MachineBasicBlock * 10153 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10154 MachineBasicBlock *BB) const { 10155 // This pseudo instruction has 3 operands: dst, src, size 10156 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10157 // Otherwise, we will generate unrolled scalar copies. 
10158 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10159 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10160 MachineFunction::iterator It = ++BB->getIterator(); 10161 10162 Register dest = MI.getOperand(0).getReg(); 10163 Register src = MI.getOperand(1).getReg(); 10164 unsigned SizeVal = MI.getOperand(2).getImm(); 10165 unsigned Align = MI.getOperand(3).getImm(); 10166 DebugLoc dl = MI.getDebugLoc(); 10167 10168 MachineFunction *MF = BB->getParent(); 10169 MachineRegisterInfo &MRI = MF->getRegInfo(); 10170 unsigned UnitSize = 0; 10171 const TargetRegisterClass *TRC = nullptr; 10172 const TargetRegisterClass *VecTRC = nullptr; 10173 10174 bool IsThumb1 = Subtarget->isThumb1Only(); 10175 bool IsThumb2 = Subtarget->isThumb2(); 10176 bool IsThumb = Subtarget->isThumb(); 10177 10178 if (Align & 1) { 10179 UnitSize = 1; 10180 } else if (Align & 2) { 10181 UnitSize = 2; 10182 } else { 10183 // Check whether we can use NEON instructions. 10184 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 10185 Subtarget->hasNEON()) { 10186 if ((Align % 16 == 0) && SizeVal >= 16) 10187 UnitSize = 16; 10188 else if ((Align % 8 == 0) && SizeVal >= 8) 10189 UnitSize = 8; 10190 } 10191 // Can't use NEON instructions. 10192 if (UnitSize == 0) 10193 UnitSize = 4; 10194 } 10195 10196 // Select the correct opcode and register class for unit size load/store 10197 bool IsNeon = UnitSize >= 8; 10198 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 10199 if (IsNeon) 10200 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 10201 : UnitSize == 8 ? &ARM::DPRRegClass 10202 : nullptr; 10203 10204 unsigned BytesLeft = SizeVal % UnitSize; 10205 unsigned LoopSize = SizeVal - BytesLeft; 10206 10207 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 10208 // Use LDR and STR to copy. 
10209 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 10210 // [destOut] = STR_POST(scratch, destIn, UnitSize) 10211 unsigned srcIn = src; 10212 unsigned destIn = dest; 10213 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 10214 Register srcOut = MRI.createVirtualRegister(TRC); 10215 Register destOut = MRI.createVirtualRegister(TRC); 10216 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10217 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 10218 IsThumb1, IsThumb2); 10219 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 10220 IsThumb1, IsThumb2); 10221 srcIn = srcOut; 10222 destIn = destOut; 10223 } 10224 10225 // Handle the leftover bytes with LDRB and STRB. 10226 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 10227 // [destOut] = STRB_POST(scratch, destIn, 1) 10228 for (unsigned i = 0; i < BytesLeft; i++) { 10229 Register srcOut = MRI.createVirtualRegister(TRC); 10230 Register destOut = MRI.createVirtualRegister(TRC); 10231 Register scratch = MRI.createVirtualRegister(TRC); 10232 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 10233 IsThumb1, IsThumb2); 10234 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 10235 IsThumb1, IsThumb2); 10236 srcIn = srcOut; 10237 destIn = destOut; 10238 } 10239 MI.eraseFromParent(); // The instruction is gone now. 10240 return BB; 10241 } 10242 10243 // Expand the pseudo op to a loop. 10244 // thisMBB: 10245 // ... 
10246 // movw varEnd, # --> with thumb2 10247 // movt varEnd, # 10248 // ldrcp varEnd, idx --> without thumb2 10249 // fallthrough --> loopMBB 10250 // loopMBB: 10251 // PHI varPhi, varEnd, varLoop 10252 // PHI srcPhi, src, srcLoop 10253 // PHI destPhi, dst, destLoop 10254 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10255 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10256 // subs varLoop, varPhi, #UnitSize 10257 // bne loopMBB 10258 // fallthrough --> exitMBB 10259 // exitMBB: 10260 // epilogue to handle left-over bytes 10261 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10262 // [destOut] = STRB_POST(scratch, destLoop, 1) 10263 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10264 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10265 MF->insert(It, loopMBB); 10266 MF->insert(It, exitMBB); 10267 10268 // Transfer the remainder of BB and its successor edges to exitMBB. 10269 exitMBB->splice(exitMBB->begin(), BB, 10270 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10271 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10272 10273 // Load an immediate to varEnd. 10274 Register varEnd = MRI.createVirtualRegister(TRC); 10275 if (Subtarget->useMovt()) { 10276 unsigned Vtmp = varEnd; 10277 if ((LoopSize & 0xFFFF0000) != 0) 10278 Vtmp = MRI.createVirtualRegister(TRC); 10279 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 10280 .addImm(LoopSize & 0xFFFF) 10281 .add(predOps(ARMCC::AL)); 10282 10283 if ((LoopSize & 0xFFFF0000) != 0) 10284 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 10285 .addReg(Vtmp) 10286 .addImm(LoopSize >> 16) 10287 .add(predOps(ARMCC::AL)); 10288 } else { 10289 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10290 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10291 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 10292 10293 // MachineConstantPool wants an explicit alignment. 
10294 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 10295 if (Align == 0) 10296 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 10297 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 10298 MachineMemOperand *CPMMO = 10299 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10300 MachineMemOperand::MOLoad, 4, 4); 10301 10302 if (IsThumb) 10303 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 10304 .addReg(varEnd, RegState::Define) 10305 .addConstantPoolIndex(Idx) 10306 .add(predOps(ARMCC::AL)) 10307 .addMemOperand(CPMMO); 10308 else 10309 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 10310 .addReg(varEnd, RegState::Define) 10311 .addConstantPoolIndex(Idx) 10312 .addImm(0) 10313 .add(predOps(ARMCC::AL)) 10314 .addMemOperand(CPMMO); 10315 } 10316 BB->addSuccessor(loopMBB); 10317 10318 // Generate the loop body: 10319 // varPhi = PHI(varLoop, varEnd) 10320 // srcPhi = PHI(srcLoop, src) 10321 // destPhi = PHI(destLoop, dst) 10322 MachineBasicBlock *entryBB = BB; 10323 BB = loopMBB; 10324 Register varLoop = MRI.createVirtualRegister(TRC); 10325 Register varPhi = MRI.createVirtualRegister(TRC); 10326 Register srcLoop = MRI.createVirtualRegister(TRC); 10327 Register srcPhi = MRI.createVirtualRegister(TRC); 10328 Register destLoop = MRI.createVirtualRegister(TRC); 10329 Register destPhi = MRI.createVirtualRegister(TRC); 10330 10331 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 10332 .addReg(varLoop).addMBB(loopMBB) 10333 .addReg(varEnd).addMBB(entryBB); 10334 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 10335 .addReg(srcLoop).addMBB(loopMBB) 10336 .addReg(src).addMBB(entryBB); 10337 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 10338 .addReg(destLoop).addMBB(loopMBB) 10339 .addReg(dest).addMBB(entryBB); 10340 10341 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10342 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 10343 Register scratch = MRI.createVirtualRegister(IsNeon ? 
VecTRC : TRC); 10344 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 10345 IsThumb1, IsThumb2); 10346 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 10347 IsThumb1, IsThumb2); 10348 10349 // Decrement loop variable by UnitSize. 10350 if (IsThumb1) { 10351 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 10352 .add(t1CondCodeOp()) 10353 .addReg(varPhi) 10354 .addImm(UnitSize) 10355 .add(predOps(ARMCC::AL)); 10356 } else { 10357 MachineInstrBuilder MIB = 10358 BuildMI(*BB, BB->end(), dl, 10359 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 10360 MIB.addReg(varPhi) 10361 .addImm(UnitSize) 10362 .add(predOps(ARMCC::AL)) 10363 .add(condCodeOp()); 10364 MIB->getOperand(5).setReg(ARM::CPSR); 10365 MIB->getOperand(5).setIsDef(true); 10366 } 10367 BuildMI(*BB, BB->end(), dl, 10368 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10369 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 10370 10371 // loopMBB can loop back to loopMBB or fall through to exitMBB. 10372 BB->addSuccessor(loopMBB); 10373 BB->addSuccessor(exitMBB); 10374 10375 // Add epilogue to handle BytesLeft. 10376 BB = exitMBB; 10377 auto StartOfExit = exitMBB->begin(); 10378 10379 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10380 // [destOut] = STRB_POST(scratch, destLoop, 1) 10381 unsigned srcIn = srcLoop; 10382 unsigned destIn = destLoop; 10383 for (unsigned i = 0; i < BytesLeft; i++) { 10384 Register srcOut = MRI.createVirtualRegister(TRC); 10385 Register destOut = MRI.createVirtualRegister(TRC); 10386 Register scratch = MRI.createVirtualRegister(TRC); 10387 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, 10388 IsThumb1, IsThumb2); 10389 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, 10390 IsThumb1, IsThumb2); 10391 srcIn = srcOut; 10392 destIn = destOut; 10393 } 10394 10395 MI.eraseFromParent(); // The instruction is gone now. 
10396 return BB; 10397 } 10398 10399 MachineBasicBlock * 10400 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, 10401 MachineBasicBlock *MBB) const { 10402 const TargetMachine &TM = getTargetMachine(); 10403 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 10404 DebugLoc DL = MI.getDebugLoc(); 10405 10406 assert(Subtarget->isTargetWindows() && 10407 "__chkstk is only supported on Windows"); 10408 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); 10409 10410 // __chkstk takes the number of words to allocate on the stack in R4, and 10411 // returns the stack adjustment in number of bytes in R4. This will not 10412 // clober any other registers (other than the obvious lr). 10413 // 10414 // Although, technically, IP should be considered a register which may be 10415 // clobbered, the call itself will not touch it. Windows on ARM is a pure 10416 // thumb-2 environment, so there is no interworking required. As a result, we 10417 // do not expect a veneer to be emitted by the linker, clobbering IP. 10418 // 10419 // Each module receives its own copy of __chkstk, so no import thunk is 10420 // required, again, ensuring that IP is not clobbered. 10421 // 10422 // Finally, although some linkers may theoretically provide a trampoline for 10423 // out of range calls (which is quite common due to a 32M range limitation of 10424 // branches for Thumb), we can generate the long-call version via 10425 // -mcmodel=large, alleviating the need for the trampoline which may clobber 10426 // IP. 
10427 10428 switch (TM.getCodeModel()) { 10429 case CodeModel::Tiny: 10430 llvm_unreachable("Tiny code model not available on ARM."); 10431 case CodeModel::Small: 10432 case CodeModel::Medium: 10433 case CodeModel::Kernel: 10434 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 10435 .add(predOps(ARMCC::AL)) 10436 .addExternalSymbol("__chkstk") 10437 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10438 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10439 .addReg(ARM::R12, 10440 RegState::Implicit | RegState::Define | RegState::Dead) 10441 .addReg(ARM::CPSR, 10442 RegState::Implicit | RegState::Define | RegState::Dead); 10443 break; 10444 case CodeModel::Large: { 10445 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10446 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 10447 10448 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 10449 .addExternalSymbol("__chkstk"); 10450 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 10451 .add(predOps(ARMCC::AL)) 10452 .addReg(Reg, RegState::Kill) 10453 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10454 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10455 .addReg(ARM::R12, 10456 RegState::Implicit | RegState::Define | RegState::Dead) 10457 .addReg(ARM::CPSR, 10458 RegState::Implicit | RegState::Define | RegState::Dead); 10459 break; 10460 } 10461 } 10462 10463 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 10464 .addReg(ARM::SP, RegState::Kill) 10465 .addReg(ARM::R4, RegState::Kill) 10466 .setMIFlags(MachineInstr::FrameSetup) 10467 .add(predOps(ARMCC::AL)) 10468 .add(condCodeOp()); 10469 10470 MI.eraseFromParent(); 10471 return MBB; 10472 } 10473 10474 MachineBasicBlock * 10475 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 10476 MachineBasicBlock *MBB) const { 10477 DebugLoc DL = MI.getDebugLoc(); 10478 MachineFunction *MF = MBB->getParent(); 10479 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10480 10481 MachineBasicBlock *ContBB = 
MF->CreateMachineBasicBlock(); 10482 MF->insert(++MBB->getIterator(), ContBB); 10483 ContBB->splice(ContBB->begin(), MBB, 10484 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 10485 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 10486 MBB->addSuccessor(ContBB); 10487 10488 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10489 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 10490 MF->push_back(TrapBB); 10491 MBB->addSuccessor(TrapBB); 10492 10493 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 10494 .addReg(MI.getOperand(0).getReg()) 10495 .addImm(0) 10496 .add(predOps(ARMCC::AL)); 10497 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 10498 .addMBB(TrapBB) 10499 .addImm(ARMCC::EQ) 10500 .addReg(ARM::CPSR); 10501 10502 MI.eraseFromParent(); 10503 return ContBB; 10504 } 10505 10506 // The CPSR operand of SelectItr might be missing a kill marker 10507 // because there were multiple uses of CPSR, and ISel didn't know 10508 // which to mark. Figure out whether SelectItr should have had a 10509 // kill marker, and set it if it should. Returns the correct kill 10510 // marker value. 10511 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 10512 MachineBasicBlock* BB, 10513 const TargetRegisterInfo* TRI) { 10514 // Scan forward through BB for a use/def of CPSR. 10515 MachineBasicBlock::iterator miI(std::next(SelectItr)); 10516 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 10517 const MachineInstr& mi = *miI; 10518 if (mi.readsRegister(ARM::CPSR)) 10519 return false; 10520 if (mi.definesRegister(ARM::CPSR)) 10521 break; // Should have kill-flag - update below. 10522 } 10523 10524 // If we hit the end of the block, check whether CPSR is live into a 10525 // successor. 
10526 if (miI == BB->end()) { 10527 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 10528 sEnd = BB->succ_end(); 10529 sItr != sEnd; ++sItr) { 10530 MachineBasicBlock* succ = *sItr; 10531 if (succ->isLiveIn(ARM::CPSR)) 10532 return false; 10533 } 10534 } 10535 10536 // We found a def, or hit the end of the basic block and CPSR wasn't live 10537 // out. SelectMI should have a kill flag on CPSR. 10538 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 10539 return true; 10540 } 10541 10542 MachineBasicBlock * 10543 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10544 MachineBasicBlock *BB) const { 10545 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10546 DebugLoc dl = MI.getDebugLoc(); 10547 bool isThumb2 = Subtarget->isThumb2(); 10548 switch (MI.getOpcode()) { 10549 default: { 10550 MI.print(errs()); 10551 llvm_unreachable("Unexpected instr type to insert"); 10552 } 10553 10554 // Thumb1 post-indexed loads are really just single-register LDMs. 10555 case ARM::tLDR_postidx: { 10556 MachineOperand Def(MI.getOperand(1)); 10557 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 10558 .add(Def) // Rn_wb 10559 .add(MI.getOperand(2)) // Rn 10560 .add(MI.getOperand(3)) // PredImm 10561 .add(MI.getOperand(4)) // PredReg 10562 .add(MI.getOperand(0)) // Rt 10563 .cloneMemRefs(MI); 10564 MI.eraseFromParent(); 10565 return BB; 10566 } 10567 10568 // The Thumb2 pre-indexed stores have the same MI operands, they just 10569 // define them differently in the .td files from the isel patterns, so 10570 // they need pseudos. 10571 case ARM::t2STR_preidx: 10572 MI.setDesc(TII->get(ARM::t2STR_PRE)); 10573 return BB; 10574 case ARM::t2STRB_preidx: 10575 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 10576 return BB; 10577 case ARM::t2STRH_preidx: 10578 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 10579 return BB; 10580 10581 case ARM::STRi_preidx: 10582 case ARM::STRBi_preidx: { 10583 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? 
ARM::STR_PRE_IMM 10584 : ARM::STRB_PRE_IMM; 10585 // Decode the offset. 10586 unsigned Offset = MI.getOperand(4).getImm(); 10587 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10588 Offset = ARM_AM::getAM2Offset(Offset); 10589 if (isSub) 10590 Offset = -Offset; 10591 10592 MachineMemOperand *MMO = *MI.memoperands_begin(); 10593 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10594 .add(MI.getOperand(0)) // Rn_wb 10595 .add(MI.getOperand(1)) // Rt 10596 .add(MI.getOperand(2)) // Rn 10597 .addImm(Offset) // offset (skip GPR==zero_reg) 10598 .add(MI.getOperand(5)) // pred 10599 .add(MI.getOperand(6)) 10600 .addMemOperand(MMO); 10601 MI.eraseFromParent(); 10602 return BB; 10603 } 10604 case ARM::STRr_preidx: 10605 case ARM::STRBr_preidx: 10606 case ARM::STRH_preidx: { 10607 unsigned NewOpc; 10608 switch (MI.getOpcode()) { 10609 default: llvm_unreachable("unexpected opcode!"); 10610 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10611 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10612 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10613 } 10614 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10615 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10616 MIB.add(MI.getOperand(i)); 10617 MI.eraseFromParent(); 10618 return BB; 10619 } 10620 10621 case ARM::tMOVCCr_pseudo: { 10622 // To "insert" a SELECT_CC instruction, we actually have to insert the 10623 // diamond control-flow pattern. The incoming instruction knows the 10624 // destination vreg to set, the condition code register to branch on, the 10625 // true/false values to select between, and a branch opcode to use. 10626 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10627 MachineFunction::iterator It = ++BB->getIterator(); 10628 10629 // thisMBB: 10630 // ... 10631 // TrueVal = ... 
10632 // cmpTY ccX, r1, r2 10633 // bCC copy1MBB 10634 // fallthrough --> copy0MBB 10635 MachineBasicBlock *thisMBB = BB; 10636 MachineFunction *F = BB->getParent(); 10637 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10638 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10639 F->insert(It, copy0MBB); 10640 F->insert(It, sinkMBB); 10641 10642 // Check whether CPSR is live past the tMOVCCr_pseudo. 10643 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10644 if (!MI.killsRegister(ARM::CPSR) && 10645 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10646 copy0MBB->addLiveIn(ARM::CPSR); 10647 sinkMBB->addLiveIn(ARM::CPSR); 10648 } 10649 10650 // Transfer the remainder of BB and its successor edges to sinkMBB. 10651 sinkMBB->splice(sinkMBB->begin(), BB, 10652 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10653 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10654 10655 BB->addSuccessor(copy0MBB); 10656 BB->addSuccessor(sinkMBB); 10657 10658 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10659 .addMBB(sinkMBB) 10660 .addImm(MI.getOperand(3).getImm()) 10661 .addReg(MI.getOperand(4).getReg()); 10662 10663 // copy0MBB: 10664 // %FalseValue = ... 10665 // # fallthrough to sinkMBB 10666 BB = copy0MBB; 10667 10668 // Update machine-CFG edges 10669 BB->addSuccessor(sinkMBB); 10670 10671 // sinkMBB: 10672 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10673 // ... 10674 BB = sinkMBB; 10675 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10676 .addReg(MI.getOperand(1).getReg()) 10677 .addMBB(copy0MBB) 10678 .addReg(MI.getOperand(2).getReg()) 10679 .addMBB(thisMBB); 10680 10681 MI.eraseFromParent(); // The pseudo instruction is gone now. 10682 return BB; 10683 } 10684 10685 case ARM::BCCi64: 10686 case ARM::BCCZi64: { 10687 // If there is an unconditional branch to the other successor, remove it. 
10688 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10689 10690 // Compare both parts that make up the double comparison separately for 10691 // equality. 10692 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10693 10694 Register LHS1 = MI.getOperand(1).getReg(); 10695 Register LHS2 = MI.getOperand(2).getReg(); 10696 if (RHSisZero) { 10697 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10698 .addReg(LHS1) 10699 .addImm(0) 10700 .add(predOps(ARMCC::AL)); 10701 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10702 .addReg(LHS2).addImm(0) 10703 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10704 } else { 10705 Register RHS1 = MI.getOperand(3).getReg(); 10706 Register RHS2 = MI.getOperand(4).getReg(); 10707 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10708 .addReg(LHS1) 10709 .addReg(RHS1) 10710 .add(predOps(ARMCC::AL)); 10711 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10712 .addReg(LHS2).addReg(RHS2) 10713 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10714 } 10715 10716 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10717 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10718 if (MI.getOperand(0).getImm() == ARMCC::NE) 10719 std::swap(destMBB, exitMBB); 10720 10721 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10722 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10723 if (isThumb2) 10724 BuildMI(BB, dl, TII->get(ARM::t2B)) 10725 .addMBB(exitMBB) 10726 .add(predOps(ARMCC::AL)); 10727 else 10728 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10729 10730 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10731 return BB; 10732 } 10733 10734 case ARM::Int_eh_sjlj_setjmp: 10735 case ARM::Int_eh_sjlj_setjmp_nofp: 10736 case ARM::tInt_eh_sjlj_setjmp: 10737 case ARM::t2Int_eh_sjlj_setjmp: 10738 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10739 return BB; 10740 10741 case ARM::Int_eh_sjlj_setup_dispatch: 10742 EmitSjLjDispatchBlock(MI, BB); 10743 return BB; 10744 10745 case ARM::ABS: 10746 case ARM::t2ABS: { 10747 // To insert an ABS instruction, we have to insert the 10748 // diamond control-flow pattern. The incoming instruction knows the 10749 // source vreg to test against 0, the destination vreg to set, 10750 // the condition code register to branch on, the 10751 // true/false values to select between, and a branch opcode to use. 10752 // It transforms 10753 // V1 = ABS V0 10754 // into 10755 // V2 = MOVS V0 10756 // BCC (branch to SinkBB if V0 >= 0) 10757 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 10758 // SinkBB: V1 = PHI(V2, V3) 10759 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10760 MachineFunction::iterator BBI = ++BB->getIterator(); 10761 MachineFunction *Fn = BB->getParent(); 10762 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10763 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10764 Fn->insert(BBI, RSBBB); 10765 Fn->insert(BBI, SinkBB); 10766 10767 Register ABSSrcReg = MI.getOperand(1).getReg(); 10768 Register ABSDstReg = MI.getOperand(0).getReg(); 10769 bool ABSSrcKIll = MI.getOperand(1).isKill(); 10770 bool isThumb2 = Subtarget->isThumb2(); 10771 MachineRegisterInfo &MRI = Fn->getRegInfo(); 10772 // In Thumb mode S must not be specified if source register is the SP or 10773 // PC and if destination register is the SP, so restrict register class 10774 Register NewRsbDstReg = MRI.createVirtualRegister( 10775 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 10776 10777 // Transfer the remainder of BB and its successor edges to sinkMBB. 
10778 SinkBB->splice(SinkBB->begin(), BB, 10779 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10780 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 10781 10782 BB->addSuccessor(RSBBB); 10783 BB->addSuccessor(SinkBB); 10784 10785 // fall through to SinkMBB 10786 RSBBB->addSuccessor(SinkBB); 10787 10788 // insert a cmp at the end of BB 10789 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10790 .addReg(ABSSrcReg) 10791 .addImm(0) 10792 .add(predOps(ARMCC::AL)); 10793 10794 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 10795 BuildMI(BB, dl, 10796 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 10797 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 10798 10799 // insert rsbri in RSBBB 10800 // Note: BCC and rsbri will be converted into predicated rsbmi 10801 // by if-conversion pass 10802 BuildMI(*RSBBB, RSBBB->begin(), dl, 10803 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 10804 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 10805 .addImm(0) 10806 .add(predOps(ARMCC::AL)) 10807 .add(condCodeOp()); 10808 10809 // insert PHI in SinkBB, 10810 // reuse ABSDstReg to not change uses of ABS instruction 10811 BuildMI(*SinkBB, SinkBB->begin(), dl, 10812 TII->get(ARM::PHI), ABSDstReg) 10813 .addReg(NewRsbDstReg).addMBB(RSBBB) 10814 .addReg(ABSSrcReg).addMBB(BB); 10815 10816 // remove ABS instruction 10817 MI.eraseFromParent(); 10818 10819 // return last added BB 10820 return SinkBB; 10821 } 10822 case ARM::COPY_STRUCT_BYVAL_I32: 10823 ++NumLoopByVals; 10824 return EmitStructByval(MI, BB); 10825 case ARM::WIN__CHKSTK: 10826 return EmitLowered__chkstk(MI, BB); 10827 case ARM::WIN__DBZCHK: 10828 return EmitLowered__dbzchk(MI, BB); 10829 } 10830 } 10831 10832 /// Attaches vregs to MEMCPY that it will use as scratch registers 10833 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 10834 /// instead of as a custom inserter because we need the use list from the SDNode. 
10835 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 10836 MachineInstr &MI, const SDNode *Node) { 10837 bool isThumb1 = Subtarget->isThumb1Only(); 10838 10839 DebugLoc DL = MI.getDebugLoc(); 10840 MachineFunction *MF = MI.getParent()->getParent(); 10841 MachineRegisterInfo &MRI = MF->getRegInfo(); 10842 MachineInstrBuilder MIB(*MF, MI); 10843 10844 // If the new dst/src is unused mark it as dead. 10845 if (!Node->hasAnyUseOfValue(0)) { 10846 MI.getOperand(0).setIsDead(true); 10847 } 10848 if (!Node->hasAnyUseOfValue(1)) { 10849 MI.getOperand(1).setIsDead(true); 10850 } 10851 10852 // The MEMCPY both defines and kills the scratch registers. 10853 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 10854 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 10855 : &ARM::GPRRegClass); 10856 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 10857 } 10858 } 10859 10860 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 10861 SDNode *Node) const { 10862 if (MI.getOpcode() == ARM::MEMCPY) { 10863 attachMEMCPYScratchRegs(Subtarget, MI, Node); 10864 return; 10865 } 10866 10867 const MCInstrDesc *MCID = &MI.getDesc(); 10868 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 10869 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 10870 // operand is still set to noreg. If needed, set the optional operand's 10871 // register to CPSR, and remove the redundant implicit def. 10872 // 10873 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 10874 10875 // Rename pseudo opcodes. 
10876 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 10877 unsigned ccOutIdx; 10878 if (NewOpc) { 10879 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 10880 MCID = &TII->get(NewOpc); 10881 10882 assert(MCID->getNumOperands() == 10883 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 10884 && "converted opcode should be the same except for cc_out" 10885 " (and, on Thumb1, pred)"); 10886 10887 MI.setDesc(*MCID); 10888 10889 // Add the optional cc_out operand 10890 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 10891 10892 // On Thumb1, move all input operands to the end, then add the predicate 10893 if (Subtarget->isThumb1Only()) { 10894 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 10895 MI.addOperand(MI.getOperand(1)); 10896 MI.RemoveOperand(1); 10897 } 10898 10899 // Restore the ties 10900 for (unsigned i = MI.getNumOperands(); i--;) { 10901 const MachineOperand& op = MI.getOperand(i); 10902 if (op.isReg() && op.isUse()) { 10903 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 10904 if (DefIdx != -1) 10905 MI.tieOperands(DefIdx, i); 10906 } 10907 } 10908 10909 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 10910 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 10911 ccOutIdx = 1; 10912 } else 10913 ccOutIdx = MCID->getNumOperands() - 1; 10914 } else 10915 ccOutIdx = MCID->getNumOperands() - 1; 10916 10917 // Any ARM instruction that sets the 's' bit should specify an optional 10918 // "cc_out" operand in the last operand position. 10919 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 10920 assert(!NewOpc && "Optional cc_out operand required"); 10921 return; 10922 } 10923 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 10924 // since we already have an optional CPSR def. 
10925 bool definesCPSR = false; 10926 bool deadCPSR = false; 10927 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 10928 ++i) { 10929 const MachineOperand &MO = MI.getOperand(i); 10930 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 10931 definesCPSR = true; 10932 if (MO.isDead()) 10933 deadCPSR = true; 10934 MI.RemoveOperand(i); 10935 break; 10936 } 10937 } 10938 if (!definesCPSR) { 10939 assert(!NewOpc && "Optional cc_out operand required"); 10940 return; 10941 } 10942 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 10943 if (deadCPSR) { 10944 assert(!MI.getOperand(ccOutIdx).getReg() && 10945 "expect uninitialized optional cc_out operand"); 10946 // Thumb1 instructions must have the S bit even if the CPSR is dead. 10947 if (!Subtarget->isThumb1Only()) 10948 return; 10949 } 10950 10951 // If this instruction was defined with an optional CPSR def and its dag node 10952 // had a live implicit CPSR def, then activate the optional CPSR def. 10953 MachineOperand &MO = MI.getOperand(ccOutIdx); 10954 MO.setReg(ARM::CPSR); 10955 MO.setIsDef(true); 10956 } 10957 10958 //===----------------------------------------------------------------------===// 10959 // ARM Optimization Hooks 10960 //===----------------------------------------------------------------------===// 10961 10962 // Helper function that checks if N is a null or all ones constant. 10963 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 10964 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 10965 } 10966 10967 // Return true if N is conditionally 0 or all ones. 
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
// OtherOp is set to the alternative value of N.
//
// CC, Invert and OtherOp are outputs. CC may be written even when the
// function returns false, so callers should only read them on success.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  switch (N->getOpcode()) {
  default: return false;
  case ISD::SELECT: {
    CC = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    // The true operand is checked first, so it wins if both are the constant.
    if (isZeroOrAllOnes(N1, AllOnes)) {
      Invert = false;
      OtherOp = N2;
      return true;
    }
    if (isZeroOrAllOnes(N2, AllOnes)) {
      Invert = true;
      OtherOp = N1;
      return true;
    }
    return false;
  }
  case ISD::ZERO_EXTEND:
    // (zext cc) can never be the all ones value.
    if (AllOnes)
      return false;
    LLVM_FALLTHROUGH;
  case ISD::SIGN_EXTEND: {
    SDLoc dl(N);
    EVT VT = N->getValueType(0);
    CC = N->getOperand(0);
    // Only an extension of an i1 SETCC result is accepted.
    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
      return false;
    Invert = !AllOnes;
    if (AllOnes)
      // When looking for an AllOnes constant, N is an sext, and the 'other'
      // value is 0.
      OtherOp = DAG.getConstant(0, dl, VT);
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      // When looking for a 0 constant, N can be zext or sext.
      OtherOp = DAG.getConstant(1, dl, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                                VT);
    return true;
  }
  }
}

// Combine a constant select operand into its use:
//
//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
//   (add (zext cc), x) -> (select cc (add x, 1), x)
//   (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}

// Attempt combineSelectAndUse on each operand of a commutative operator N.
// An operand is only tried as the select when its node has a single use.
// Operand 0 is tried first; returns SDValue() if neither attempt succeeds.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}

// Returns true if N is an ARMISD::VUZP node, or an ARMISD::VTRN node whose
// first result type is v2i32.
static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}

// Fold a 64-bit vector ADD of the two distinct results of one VUZP-like node
// into an arm_neon_vpadd intrinsic call on that node's two operands.
// Returns SDValue() when the pattern does not match.
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1).
  // Both operands must come from the same node but be different results.
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// Fold ADD(ext(VUZP.0), ext(VUZP.1)), where both extensions are sign-extends
// or both are zero-extends, into arm_neon_vpaddls / arm_neon_vpaddlu applied
// to the concatenation of the VUZP-like node's two operands.
// Returns SDValue() when the pattern does not match.
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  // The intrinsic input has the narrow element type and twice as many
  // elements as the result.
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
//
// Matches an ADD of two BUILD_VECTORs whose elements are the even-indexed
// (N0) and odd-indexed (N1) lanes extracted from one source vector, and
// replaces it with an arm_neon_vpaddls intrinsic on that vector followed by
// an ANY_EXTEND or TRUNCATE to the ADD's type. Returns SDValue() otherwise.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expected both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operands to the ADD which are BUILD_VECTORs,
  // check to see if each of their operands are an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify its the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify its correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  // The intrinsic result has elements twice as wide as the input lanes.
  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

// Returns V if it is produced by a UMUL_LOHI or SMUL_LOHI node, otherwise an
// empty SDValue.
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

// Try to fold an ADDC/ADDE pair that accumulates a 16x16-bit multiply into a
// 64-bit value into one ARMISD::SMLALBB/SMLALBT/SMLALTB/SMLALTT node.
// Requires the base DSP extension. On success the uses of both add nodes are
// replaced and AddcNode's first result is returned; otherwise returns
// SDValue().
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulates the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values
  // (addc (adde (mul 16bit, 16bit), lo), hi)
  // The MUL may be either operand of the ADDC; the other one is Lo.
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  // Likewise the SRA may be either operand of the ADDE; the other one is Hi.
  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  // The SRA must shift by the constant 31 ...
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  // ... and must shift the same MUL that feeds the ADDC.
  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  // Choose the variant from how each multiplicand is recognised (isS16 or
  // isSRA16); for an isSRA16 operand the value being shifted is used.
  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  // No variant matched.
  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

// Try to fold an ADDE/SUBE node (and the ADDC/SUBC feeding its third operand)
// together with a S/UMUL_LOHI into ARMISD::SMLAL/UMLAL, or into
// ARMISD::SMMLAR/SMMLSR for the rounded high-half case described below.
// Returns AddeSubeNode's first result after replacing uses on success, or
// SDValue() when no pattern matches.
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :carry /
  //                  V      V
  //                    ADDE   <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
  // a constant with the exact value of 0x80000000, we recognize we are dealing
  // with a "rounded multiply and add" (or subtract) and transform it into
  // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // maybe a SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  // The high addend is whichever ADDE/SUBE operand is not the multiply.
  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC/SUBC we are checking.

  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL, furthermore the LowAddSub must be a constant
  // addition or subtraction with the value of 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function can not handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}

static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL.
Check if Addc uses a node which has already 11538 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde 11539 // as the addend, and it's handled in PerformUMLALCombine. 11540 11541 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11542 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11543 11544 // Check that we have a glued ADDC node. 11545 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 11546 if (AddcNode->getOpcode() != ARMISD::ADDC) 11547 return SDValue(); 11548 11549 // Find the converted UMAAL or quit if it doesn't exist. 11550 SDNode *UmlalNode = nullptr; 11551 SDValue AddHi; 11552 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 11553 UmlalNode = AddcNode->getOperand(0).getNode(); 11554 AddHi = AddcNode->getOperand(1); 11555 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 11556 UmlalNode = AddcNode->getOperand(1).getNode(); 11557 AddHi = AddcNode->getOperand(0); 11558 } else { 11559 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11560 } 11561 11562 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 11563 // the ADDC as well as Zero. 11564 if (!isNullConstant(UmlalNode->getOperand(3))) 11565 return SDValue(); 11566 11567 if ((isNullConstant(AddeNode->getOperand(0)) && 11568 AddeNode->getOperand(1).getNode() == UmlalNode) || 11569 (AddeNode->getOperand(0).getNode() == UmlalNode && 11570 isNullConstant(AddeNode->getOperand(1)))) { 11571 SelectionDAG &DAG = DCI.DAG; 11572 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 11573 UmlalNode->getOperand(2), AddHi }; 11574 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 11575 DAG.getVTList(MVT::i32, MVT::i32), Ops); 11576 11577 // Replace the ADDs' nodes uses by the UMAAL node's values. 
11578 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 11579 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 11580 11581 // Return original node to notify the driver to stop replacing. 11582 return SDValue(AddeNode, 0); 11583 } 11584 return SDValue(); 11585 } 11586 11587 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 11588 const ARMSubtarget *Subtarget) { 11589 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11590 return SDValue(); 11591 11592 // Check that we have a pair of ADDC and ADDE as operands. 11593 // Both addends of the ADDE must be zero. 11594 SDNode* AddcNode = N->getOperand(2).getNode(); 11595 SDNode* AddeNode = N->getOperand(3).getNode(); 11596 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 11597 (AddeNode->getOpcode() == ARMISD::ADDE) && 11598 isNullConstant(AddeNode->getOperand(0)) && 11599 isNullConstant(AddeNode->getOperand(1)) && 11600 (AddeNode->getOperand(2).getNode() == AddcNode)) 11601 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 11602 DAG.getVTList(MVT::i32, MVT::i32), 11603 {N->getOperand(0), N->getOperand(1), 11604 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 11605 else 11606 return SDValue(); 11607 } 11608 11609 static SDValue PerformAddcSubcCombine(SDNode *N, 11610 TargetLowering::DAGCombinerInfo &DCI, 11611 const ARMSubtarget *Subtarget) { 11612 SelectionDAG &DAG(DCI.DAG); 11613 11614 if (N->getOpcode() == ARMISD::SUBC) { 11615 // (SUBC (ADDE 0, 0, C), 1) -> C 11616 SDValue LHS = N->getOperand(0); 11617 SDValue RHS = N->getOperand(1); 11618 if (LHS->getOpcode() == ARMISD::ADDE && 11619 isNullConstant(LHS->getOperand(0)) && 11620 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 11621 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 11622 } 11623 } 11624 11625 if (Subtarget->isThumb1Only()) { 11626 SDValue RHS = N->getOperand(1); 11627 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11628 int32_t imm = 
C->getSExtValue(); 11629 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 11630 SDLoc DL(N); 11631 RHS = DAG.getConstant(-imm, DL, MVT::i32); 11632 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 11633 : ARMISD::ADDC; 11634 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 11635 } 11636 } 11637 } 11638 11639 return SDValue(); 11640 } 11641 11642 static SDValue PerformAddeSubeCombine(SDNode *N, 11643 TargetLowering::DAGCombinerInfo &DCI, 11644 const ARMSubtarget *Subtarget) { 11645 if (Subtarget->isThumb1Only()) { 11646 SelectionDAG &DAG = DCI.DAG; 11647 SDValue RHS = N->getOperand(1); 11648 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11649 int64_t imm = C->getSExtValue(); 11650 if (imm < 0) { 11651 SDLoc DL(N); 11652 11653 // The with-carry-in form matches bitwise not instead of the negation. 11654 // Effectively, the inverse interpretation of the carry flag already 11655 // accounts for part of the negation. 11656 RHS = DAG.getConstant(~imm, DL, MVT::i32); 11657 11658 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? 
ARMISD::SUBE 11659 : ARMISD::ADDE; 11660 return DAG.getNode(Opcode, DL, N->getVTList(), 11661 N->getOperand(0), RHS, N->getOperand(2)); 11662 } 11663 } 11664 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 11665 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 11666 } 11667 return SDValue(); 11668 } 11669 11670 static SDValue PerformABSCombine(SDNode *N, 11671 TargetLowering::DAGCombinerInfo &DCI, 11672 const ARMSubtarget *Subtarget) { 11673 SDValue res; 11674 SelectionDAG &DAG = DCI.DAG; 11675 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11676 11677 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 11678 return SDValue(); 11679 11680 if (!TLI.expandABS(N, res, DAG)) 11681 return SDValue(); 11682 11683 return res; 11684 } 11685 11686 /// PerformADDECombine - Target-specific dag combine transform from 11687 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 11688 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 11689 static SDValue PerformADDECombine(SDNode *N, 11690 TargetLowering::DAGCombinerInfo &DCI, 11691 const ARMSubtarget *Subtarget) { 11692 // Only ARM and Thumb2 support UMLAL/SMLAL. 11693 if (Subtarget->isThumb1Only()) 11694 return PerformAddeSubeCombine(N, DCI, Subtarget); 11695 11696 // Only perform the checks after legalize when the pattern is available. 11697 if (DCI.isBeforeLegalize()) return SDValue(); 11698 11699 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 11700 } 11701 11702 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 11703 /// operands N0 and N1. This is a helper for PerformADDCombine that is 11704 /// called with the default operands, and if that fails, with commuted 11705 /// operands. 11706 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 11707 TargetLowering::DAGCombinerInfo &DCI, 11708 const ARMSubtarget *Subtarget){ 11709 // Attempt to create vpadd for this add. 
11710 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 11711 return Result; 11712 11713 // Attempt to create vpaddl for this add. 11714 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 11715 return Result; 11716 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 11717 Subtarget)) 11718 return Result; 11719 11720 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11721 if (N0.getNode()->hasOneUse()) 11722 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 11723 return Result; 11724 return SDValue(); 11725 } 11726 11727 bool 11728 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 11729 CombineLevel Level) const { 11730 if (Level == BeforeLegalizeTypes) 11731 return true; 11732 11733 if (N->getOpcode() != ISD::SHL) 11734 return true; 11735 11736 if (Subtarget->isThumb1Only()) { 11737 // Avoid making expensive immediates by commuting shifts. (This logic 11738 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 11739 // for free.) 11740 if (N->getOpcode() != ISD::SHL) 11741 return true; 11742 SDValue N1 = N->getOperand(0); 11743 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 11744 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 11745 return true; 11746 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 11747 if (Const->getAPIntValue().ult(256)) 11748 return false; 11749 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 11750 Const->getAPIntValue().sgt(-256)) 11751 return false; 11752 } 11753 return true; 11754 } 11755 11756 // Turn off commute-with-shift transform after legalization, so it doesn't 11757 // conflict with PerformSHLSimplify. (We could try to detect when 11758 // PerformSHLSimplify would trigger more precisely, but it isn't 11759 // really necessary.) 
11760 return false; 11761 } 11762 11763 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 11764 const SDNode *N, CombineLevel Level) const { 11765 if (!Subtarget->isThumb1Only()) 11766 return true; 11767 11768 if (Level == BeforeLegalizeTypes) 11769 return true; 11770 11771 return false; 11772 } 11773 11774 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 11775 if (!Subtarget->hasNEON()) { 11776 if (Subtarget->isThumb1Only()) 11777 return VT.getScalarSizeInBits() <= 32; 11778 return true; 11779 } 11780 return VT.isScalarInteger(); 11781 } 11782 11783 static SDValue PerformSHLSimplify(SDNode *N, 11784 TargetLowering::DAGCombinerInfo &DCI, 11785 const ARMSubtarget *ST) { 11786 // Allow the generic combiner to identify potential bswaps. 11787 if (DCI.isBeforeLegalize()) 11788 return SDValue(); 11789 11790 // DAG combiner will fold: 11791 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 11792 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 11793 // Other code patterns that can be also be modified have the following form: 11794 // b + ((a << 1) | 510) 11795 // b + ((a << 1) & 510) 11796 // b + ((a << 1) ^ 510) 11797 // b + ((a << 1) + 510) 11798 11799 // Many instructions can perform the shift for free, but it requires both 11800 // the operands to be registers. If c1 << c2 is too large, a mov immediate 11801 // instruction will needed. So, unfold back to the original pattern if: 11802 // - if c1 and c2 are small enough that they don't require mov imms. 11803 // - the user(s) of the node can perform an shl 11804 11805 // No shifted operands for 16-bit instructions. 11806 if (ST->isThumb() && ST->isThumb1Only()) 11807 return SDValue(); 11808 11809 // Check that all the users could perform the shl themselves. 
11810 for (auto U : N->uses()) { 11811 switch(U->getOpcode()) { 11812 default: 11813 return SDValue(); 11814 case ISD::SUB: 11815 case ISD::ADD: 11816 case ISD::AND: 11817 case ISD::OR: 11818 case ISD::XOR: 11819 case ISD::SETCC: 11820 case ARMISD::CMP: 11821 // Check that the user isn't already using a constant because there 11822 // aren't any instructions that support an immediate operand and a 11823 // shifted operand. 11824 if (isa<ConstantSDNode>(U->getOperand(0)) || 11825 isa<ConstantSDNode>(U->getOperand(1))) 11826 return SDValue(); 11827 11828 // Check that it's not already using a shift. 11829 if (U->getOperand(0).getOpcode() == ISD::SHL || 11830 U->getOperand(1).getOpcode() == ISD::SHL) 11831 return SDValue(); 11832 break; 11833 } 11834 } 11835 11836 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 11837 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 11838 return SDValue(); 11839 11840 if (N->getOperand(0).getOpcode() != ISD::SHL) 11841 return SDValue(); 11842 11843 SDValue SHL = N->getOperand(0); 11844 11845 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11846 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 11847 if (!C1ShlC2 || !C2) 11848 return SDValue(); 11849 11850 APInt C2Int = C2->getAPIntValue(); 11851 APInt C1Int = C1ShlC2->getAPIntValue(); 11852 11853 // Check that performing a lshr will not lose any information. 11854 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 11855 C2Int.getBitWidth() - C2->getZExtValue()); 11856 if ((C1Int & Mask) != C1Int) 11857 return SDValue(); 11858 11859 // Shift the first constant. 11860 C1Int.lshrInPlace(C2Int); 11861 11862 // The immediates are encoded as an 8-bit value that can be rotated. 
11863 auto LargeImm = [](const APInt &Imm) { 11864 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 11865 return Imm.getBitWidth() - Zeros > 8; 11866 }; 11867 11868 if (LargeImm(C1Int) || LargeImm(C2Int)) 11869 return SDValue(); 11870 11871 SelectionDAG &DAG = DCI.DAG; 11872 SDLoc dl(N); 11873 SDValue X = SHL.getOperand(0); 11874 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 11875 DAG.getConstant(C1Int, dl, MVT::i32)); 11876 // Shift left to compensate for the lshr of C1Int. 11877 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 11878 11879 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 11880 SHL.dump(); N->dump()); 11881 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 11882 return Res; 11883 } 11884 11885 11886 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 11887 /// 11888 static SDValue PerformADDCombine(SDNode *N, 11889 TargetLowering::DAGCombinerInfo &DCI, 11890 const ARMSubtarget *Subtarget) { 11891 SDValue N0 = N->getOperand(0); 11892 SDValue N1 = N->getOperand(1); 11893 11894 // Only works one way, because it needs an immediate operand. 11895 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11896 return Result; 11897 11898 // First try with the default operand order. 11899 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 11900 return Result; 11901 11902 // If that didn't work, try again with the operands commuted. 11903 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 11904 } 11905 11906 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 
11907 /// 11908 static SDValue PerformSUBCombine(SDNode *N, 11909 TargetLowering::DAGCombinerInfo &DCI, 11910 const ARMSubtarget *Subtarget) { 11911 SDValue N0 = N->getOperand(0); 11912 SDValue N1 = N->getOperand(1); 11913 11914 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11915 if (N1.getNode()->hasOneUse()) 11916 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 11917 return Result; 11918 11919 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 11920 return SDValue(); 11921 11922 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 11923 // so that we can readily pattern match more mve instructions which can use 11924 // a scalar operand. 11925 SDValue VDup = N->getOperand(1); 11926 if (VDup->getOpcode() != ARMISD::VDUP) 11927 return SDValue(); 11928 11929 SDValue VMov = N->getOperand(0); 11930 if (VMov->getOpcode() == ISD::BITCAST) 11931 VMov = VMov->getOperand(0); 11932 11933 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 11934 return SDValue(); 11935 11936 SDLoc dl(N); 11937 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 11938 DCI.DAG.getConstant(0, dl, MVT::i32), 11939 VDup->getOperand(0)); 11940 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 11941 } 11942 11943 /// PerformVMULCombine 11944 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 11945 /// special multiplier accumulator forwarding. 
11946 /// vmul d3, d0, d2 11947 /// vmla d3, d1, d2 11948 /// is faster than 11949 /// vadd d3, d0, d1 11950 /// vmul d3, d3, d2 11951 // However, for (A + B) * (A + B), 11952 // vadd d2, d0, d1 11953 // vmul d3, d0, d2 11954 // vmla d3, d1, d2 11955 // is slower than 11956 // vadd d2, d0, d1 11957 // vmul d3, d2, d2 11958 static SDValue PerformVMULCombine(SDNode *N, 11959 TargetLowering::DAGCombinerInfo &DCI, 11960 const ARMSubtarget *Subtarget) { 11961 if (!Subtarget->hasVMLxForwarding()) 11962 return SDValue(); 11963 11964 SelectionDAG &DAG = DCI.DAG; 11965 SDValue N0 = N->getOperand(0); 11966 SDValue N1 = N->getOperand(1); 11967 unsigned Opcode = N0.getOpcode(); 11968 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11969 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 11970 Opcode = N1.getOpcode(); 11971 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11972 Opcode != ISD::FADD && Opcode != ISD::FSUB) 11973 return SDValue(); 11974 std::swap(N0, N1); 11975 } 11976 11977 if (N0 == N1) 11978 return SDValue(); 11979 11980 EVT VT = N->getValueType(0); 11981 SDLoc DL(N); 11982 SDValue N00 = N0->getOperand(0); 11983 SDValue N01 = N0->getOperand(1); 11984 return DAG.getNode(Opcode, DL, VT, 11985 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 11986 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 11987 } 11988 11989 static SDValue PerformMULCombine(SDNode *N, 11990 TargetLowering::DAGCombinerInfo &DCI, 11991 const ARMSubtarget *Subtarget) { 11992 SelectionDAG &DAG = DCI.DAG; 11993 11994 if (Subtarget->isThumb1Only()) 11995 return SDValue(); 11996 11997 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11998 return SDValue(); 11999 12000 EVT VT = N->getValueType(0); 12001 if (VT.is64BitVector() || VT.is128BitVector()) 12002 return PerformVMULCombine(N, DCI, Subtarget); 12003 if (VT != MVT::i32) 12004 return SDValue(); 12005 12006 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12007 if (!C) 12008 return SDValue(); 12009 12010 int64_t MulAmt = C->getSExtValue(); 
12011 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 12012 12013 ShiftAmt = ShiftAmt & (32 - 1); 12014 SDValue V = N->getOperand(0); 12015 SDLoc DL(N); 12016 12017 SDValue Res; 12018 MulAmt >>= ShiftAmt; 12019 12020 if (MulAmt >= 0) { 12021 if (isPowerOf2_32(MulAmt - 1)) { 12022 // (mul x, 2^N + 1) => (add (shl x, N), x) 12023 Res = DAG.getNode(ISD::ADD, DL, VT, 12024 V, 12025 DAG.getNode(ISD::SHL, DL, VT, 12026 V, 12027 DAG.getConstant(Log2_32(MulAmt - 1), DL, 12028 MVT::i32))); 12029 } else if (isPowerOf2_32(MulAmt + 1)) { 12030 // (mul x, 2^N - 1) => (sub (shl x, N), x) 12031 Res = DAG.getNode(ISD::SUB, DL, VT, 12032 DAG.getNode(ISD::SHL, DL, VT, 12033 V, 12034 DAG.getConstant(Log2_32(MulAmt + 1), DL, 12035 MVT::i32)), 12036 V); 12037 } else 12038 return SDValue(); 12039 } else { 12040 uint64_t MulAmtAbs = -MulAmt; 12041 if (isPowerOf2_32(MulAmtAbs + 1)) { 12042 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 12043 Res = DAG.getNode(ISD::SUB, DL, VT, 12044 V, 12045 DAG.getNode(ISD::SHL, DL, VT, 12046 V, 12047 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 12048 MVT::i32))); 12049 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 12050 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 12051 Res = DAG.getNode(ISD::ADD, DL, VT, 12052 V, 12053 DAG.getNode(ISD::SHL, DL, VT, 12054 V, 12055 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 12056 MVT::i32))); 12057 Res = DAG.getNode(ISD::SUB, DL, VT, 12058 DAG.getConstant(0, DL, MVT::i32), Res); 12059 } else 12060 return SDValue(); 12061 } 12062 12063 if (ShiftAmt != 0) 12064 Res = DAG.getNode(ISD::SHL, DL, VT, 12065 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 12066 12067 // Do not add new nodes to DAG combiner worklist. 12068 DCI.CombineTo(N, Res, false); 12069 return SDValue(); 12070 } 12071 12072 static SDValue CombineANDShift(SDNode *N, 12073 TargetLowering::DAGCombinerInfo &DCI, 12074 const ARMSubtarget *Subtarget) { 12075 // Allow DAGCombine to pattern-match before we touch the canonical form. 
12076 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12077 return SDValue(); 12078 12079 if (N->getValueType(0) != MVT::i32) 12080 return SDValue(); 12081 12082 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12083 if (!N1C) 12084 return SDValue(); 12085 12086 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 12087 // Don't transform uxtb/uxth. 12088 if (C1 == 255 || C1 == 65535) 12089 return SDValue(); 12090 12091 SDNode *N0 = N->getOperand(0).getNode(); 12092 if (!N0->hasOneUse()) 12093 return SDValue(); 12094 12095 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 12096 return SDValue(); 12097 12098 bool LeftShift = N0->getOpcode() == ISD::SHL; 12099 12100 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 12101 if (!N01C) 12102 return SDValue(); 12103 12104 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 12105 if (!C2 || C2 >= 32) 12106 return SDValue(); 12107 12108 // Clear irrelevant bits in the mask. 12109 if (LeftShift) 12110 C1 &= (-1U << C2); 12111 else 12112 C1 &= (-1U >> C2); 12113 12114 SelectionDAG &DAG = DCI.DAG; 12115 SDLoc DL(N); 12116 12117 // We have a pattern of the form "(and (shl x, c2) c1)" or 12118 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 12119 // transform to a pair of shifts, to save materializing c1. 12120 12121 // First pattern: right shift, then mask off leading bits. 12122 // FIXME: Use demanded bits? 12123 if (!LeftShift && isMask_32(C1)) { 12124 uint32_t C3 = countLeadingZeros(C1); 12125 if (C2 < C3) { 12126 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12127 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12128 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12129 DAG.getConstant(C3, DL, MVT::i32)); 12130 } 12131 } 12132 12133 // First pattern, reversed: left shift, then mask off trailing bits. 
12134 if (LeftShift && isMask_32(~C1)) { 12135 uint32_t C3 = countTrailingZeros(C1); 12136 if (C2 < C3) { 12137 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12138 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12139 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12140 DAG.getConstant(C3, DL, MVT::i32)); 12141 } 12142 } 12143 12144 // Second pattern: left shift, then mask off leading bits. 12145 // FIXME: Use demanded bits? 12146 if (LeftShift && isShiftedMask_32(C1)) { 12147 uint32_t Trailing = countTrailingZeros(C1); 12148 uint32_t C3 = countLeadingZeros(C1); 12149 if (Trailing == C2 && C2 + C3 < 32) { 12150 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12151 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12152 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12153 DAG.getConstant(C3, DL, MVT::i32)); 12154 } 12155 } 12156 12157 // Second pattern, reversed: right shift, then mask off trailing bits. 12158 // FIXME: Handle other patterns of known/demanded bits. 12159 if (!LeftShift && isShiftedMask_32(C1)) { 12160 uint32_t Leading = countLeadingZeros(C1); 12161 uint32_t C3 = countTrailingZeros(C1); 12162 if (Leading == C2 && C2 + C3 < 32) { 12163 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12164 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12165 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12166 DAG.getConstant(C3, DL, MVT::i32)); 12167 } 12168 } 12169 12170 // FIXME: Transform "(and (shl x, c2) c1)" -> 12171 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 12172 // c1. 
12173 return SDValue(); 12174 } 12175 12176 static SDValue PerformANDCombine(SDNode *N, 12177 TargetLowering::DAGCombinerInfo &DCI, 12178 const ARMSubtarget *Subtarget) { 12179 // Attempt to use immediate-form VBIC 12180 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12181 SDLoc dl(N); 12182 EVT VT = N->getValueType(0); 12183 SelectionDAG &DAG = DCI.DAG; 12184 12185 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12186 return SDValue(); 12187 12188 APInt SplatBits, SplatUndef; 12189 unsigned SplatBitSize; 12190 bool HasAnyUndefs; 12191 if (BVN && Subtarget->hasNEON() && 12192 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12193 if (SplatBitSize <= 64) { 12194 EVT VbicVT; 12195 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 12196 SplatUndef.getZExtValue(), SplatBitSize, 12197 DAG, dl, VbicVT, VT.is128BitVector(), 12198 OtherModImm); 12199 if (Val.getNode()) { 12200 SDValue Input = 12201 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 12202 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 12203 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 12204 } 12205 } 12206 } 12207 12208 if (!Subtarget->isThumb1Only()) { 12209 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 12210 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 12211 return Result; 12212 12213 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12214 return Result; 12215 } 12216 12217 if (Subtarget->isThumb1Only()) 12218 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 12219 return Result; 12220 12221 return SDValue(); 12222 } 12223 12224 // Try combining OR nodes to SMULWB, SMULWT. 
12225 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 12226 TargetLowering::DAGCombinerInfo &DCI, 12227 const ARMSubtarget *Subtarget) { 12228 if (!Subtarget->hasV6Ops() || 12229 (Subtarget->isThumb() && 12230 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 12231 return SDValue(); 12232 12233 SDValue SRL = OR->getOperand(0); 12234 SDValue SHL = OR->getOperand(1); 12235 12236 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 12237 SRL = OR->getOperand(1); 12238 SHL = OR->getOperand(0); 12239 } 12240 if (!isSRL16(SRL) || !isSHL16(SHL)) 12241 return SDValue(); 12242 12243 // The first operands to the shifts need to be the two results from the 12244 // same smul_lohi node. 12245 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 12246 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 12247 return SDValue(); 12248 12249 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 12250 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 12251 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 12252 return SDValue(); 12253 12254 // Now we have: 12255 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) 12256 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. 12257 // For SMUWB the 16-bit value will signed extended somehow. 12258 // For SMULWT only the SRA is required. 
// Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

/// PerformORCombineToBFI - Try to rewrite the i32 ISD::OR node \p N as an
/// ARMISD::BFI (bitfield insert).
///
/// Operand 0 of \p N is read as (and A, mask) without re-checking its opcode;
/// the caller in this file (PerformORCombine) only calls this when operand 0
/// is a single-use ISD::AND.
///
/// \returns SDValue(N, 0) after replacing \p N through DCI.CombineTo when one
/// of the patterns below matches, and an empty SDValue otherwise.
static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & ~mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  // "A" in the patterns above.
  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // Every set bit of val must lie in the field cleared by mask.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Move the value down so that it starts at bit 0.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

/// isValidMVECond - Return true if condition code \p CC is accepted for an MVE
/// vector compare. EQ, NE, LE, GT, GE and LT are always accepted; HS and HI
/// are accepted only when \p IsFloat is false; every other code is rejected.
static bool isValidMVECond(unsigned CC, bool IsFloat) {
  switch (CC) {
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::LE:
  case ARMCC::GT:
  case ARMCC::GE:
  case ARMCC::LT:
    return true;
  case ARMCC::HS:
  case ARMCC::HI:
    return !IsFloat;
  default:
    return false;
  };
}

/// PerformORCombine_i1 - Combine an OR of two vector compares
/// (ARMISD::VCMP / ARMISD::VCMPZ) into the inverse of an AND of the opposite
/// compares.
///
/// \returns the new (xor (and cmp0', cmp1'), all-ones) node, or an empty
/// SDValue when either operand is not a VCMP/VCMPZ or an opposite condition is
/// rejected by isValidMVECond.
static SDValue PerformORCombine_i1(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "not (and ~A, ~B)", as the "and" is easier to
  // chain together with predicates
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // ARMCC::AL doubles as the "operand is not a compare" marker below.
  // The condition code is operand 2 of a VCMP and operand 1 of a VCMPZ.
  ARMCC::CondCodes CondCode0 = ARMCC::AL;
  ARMCC::CondCodes CondCode1 = ARMCC::AL;
  if (N0->getOpcode() == ARMISD::VCMP)
    CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
                    ->getZExtValue();
  else if (N0->getOpcode() == ARMISD::VCMPZ)
    CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
                    ->getZExtValue();
  if (N1->getOpcode() == ARMISD::VCMP)
    CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
                    ->getZExtValue();
  else if (N1->getOpcode() == ARMISD::VCMPZ)
    CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
                    ->getZExtValue();

  if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
    return SDValue();

  unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
  unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);

  // Validity is judged against the type of each compare's first operand.
  if (!isValidMVECond(Opposite0,
                      N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
      !isValidMVECond(Opposite1,
                      N1->getOperand(0)->getValueType(0).isFloatingPoint()))
    return SDValue();

  // Rebuild each compare with the same inputs and the opposite condition.
  // VCMPZ has a single compared operand, so operand 1 is only copied for VCMP.
  SmallVector<SDValue, 4> Ops0;
  Ops0.push_back(N0->getOperand(0));
  if (N0->getOpcode() == ARMISD::VCMP)
    Ops0.push_back(N0->getOperand(1));
  Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
  SmallVector<SDValue, 4> Ops1;
  Ops1.push_back(N1->getOperand(0));
  if (N1->getOpcode() == ARMISD::VCMP)
    Ops1.push_back(N1->getOperand(1));
  Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));

  SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
  SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
  SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
  // XOR with all-ones inverts the AND.
  return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
                         DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
///
/// Tries, in order: immediate-form VORR (NEON), the select and SMULWB/SMULWT
/// folds (not Thumb1), VBSL for complementary constant masks (NEON), the MVE
/// predicate rewrite, BFI formation, and finally PerformSHLSimplify. Returns
/// an empty SDValue when the result type is not legal or nothing applies.
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DCI, Subtarget);

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

/// PerformXORCombine - Target-specific dag combine xforms for ISD::XOR.
/// Nothing is attempted for illegal result types or on Thumb1-only targets.
static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  return SDValue();
}

// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
// their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  SDValue From = N->getOperand(1);
  // Operand 2 holds the inverted destination mask; FromMask starts out as the
  // same number of bits, positioned at bit 0.
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  // If the "from" value is (srl X, #C), look through the shift: the inserted
  // bits then start at bit #C of X, so move FromMask up by #C and use X as
  // the "from" value.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B must be zero.
//
// Returns true when the lowest set bit of A sits immediately above the highest
// set bit of B.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA = A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

// FindBFIToCombineWith - N is a BFI. Walk the chain of BFIs feeding its
// destination operand and return one that can be merged with N, or an empty
// SDValue if there is none.
static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can
  // combine with, if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    // Both the destination and the source fields have to line up, in the same
    // order.
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}

/// PerformBFICombine - Target-specific dag combine xforms for ARMISD::BFI.
///
/// Either drops an AND on the inserted value when it only clears bits the BFI
/// does not read, or merges N with a compatible BFI further up its destination
/// chain. Returns an empty SDValue when neither applies.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    // Guards the shift below: shifting by the full bit width would be
    // undefined behavior.
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    // Mask covers the low Width bits of the inserted value.
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the combined source field does not start at bit 0, shift it down.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
        ISD::SRL, dl, VT, From1,
        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    // First word: same address, chain and alignment as the f64 load.
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlignment(), LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    // Second word: 4 bytes further on, with the alignment capped at 4.
    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 std::min(4U, LD->getAlignment()),
                                 LD->getMemOperand()->getFlags());

    // Users of the old load's chain result now hang off the second load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    // On big-endian targets the two words are handed back in swapped order.
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Look through a bitcast on either operand.
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  // Both operands must be results 0 and 1, in that order, of one VMOVRRD.
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}

/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
12803 static bool hasNormalLoadOperand(SDNode *N) { 12804 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 12805 for (unsigned i = 0; i < NumElts; ++i) { 12806 SDNode *Elt = N->getOperand(i).getNode(); 12807 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 12808 return true; 12809 } 12810 return false; 12811 } 12812 12813 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 12814 /// ISD::BUILD_VECTOR. 12815 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 12816 TargetLowering::DAGCombinerInfo &DCI, 12817 const ARMSubtarget *Subtarget) { 12818 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 12819 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 12820 // into a pair of GPRs, which is fine when the value is used as a scalar, 12821 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 12822 SelectionDAG &DAG = DCI.DAG; 12823 if (N->getNumOperands() == 2) 12824 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 12825 return RV; 12826 12827 // Load i64 elements as f64 values so that type legalization does not split 12828 // them up into i32 values. 12829 EVT VT = N->getValueType(0); 12830 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 12831 return SDValue(); 12832 SDLoc dl(N); 12833 SmallVector<SDValue, 8> Ops; 12834 unsigned NumElts = VT.getVectorNumElements(); 12835 for (unsigned i = 0; i < NumElts; ++i) { 12836 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 12837 Ops.push_back(V); 12838 // Make the DAGCombiner fold the bitcast. 12839 DCI.AddToWorklist(V.getNode()); 12840 } 12841 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 12842 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 12843 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 12844 } 12845 12846 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 
12847 static SDValue 12848 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12849 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 12850 // At that time, we may have inserted bitcasts from integer to float. 12851 // If these bitcasts have survived DAGCombine, change the lowering of this 12852 // BUILD_VECTOR in something more vector friendly, i.e., that does not 12853 // force to use floating point types. 12854 12855 // Make sure we can change the type of the vector. 12856 // This is possible iff: 12857 // 1. The vector is only used in a bitcast to a integer type. I.e., 12858 // 1.1. Vector is used only once. 12859 // 1.2. Use is a bit convert to an integer type. 12860 // 2. The size of its operands are 32-bits (64-bits are not legal). 12861 EVT VT = N->getValueType(0); 12862 EVT EltVT = VT.getVectorElementType(); 12863 12864 // Check 1.1. and 2. 12865 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 12866 return SDValue(); 12867 12868 // By construction, the input type must be float. 12869 assert(EltVT == MVT::f32 && "Unexpected type!"); 12870 12871 // Check 1.2. 12872 SDNode *Use = *N->use_begin(); 12873 if (Use->getOpcode() != ISD::BITCAST || 12874 Use->getValueType(0).isFloatingPoint()) 12875 return SDValue(); 12876 12877 // Check profitability. 12878 // Model is, if more than half of the relevant operands are bitcast from 12879 // i32, turn the build_vector into a sequence of insert_vector_elt. 12880 // Relevant operands are everything that is not statically 12881 // (i.e., at compile time) bitcasted. 12882 unsigned NumOfBitCastedElts = 0; 12883 unsigned NumElts = VT.getVectorNumElements(); 12884 unsigned NumOfRelevantElts = NumElts; 12885 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 12886 SDValue Elt = N->getOperand(Idx); 12887 if (Elt->getOpcode() == ISD::BITCAST) { 12888 // Assume only bit cast to i32 will go away. 
12889 if (Elt->getOperand(0).getValueType() == MVT::i32) 12890 ++NumOfBitCastedElts; 12891 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 12892 // Constants are statically casted, thus do not count them as 12893 // relevant operands. 12894 --NumOfRelevantElts; 12895 } 12896 12897 // Check if more than half of the elements require a non-free bitcast. 12898 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 12899 return SDValue(); 12900 12901 SelectionDAG &DAG = DCI.DAG; 12902 // Create the new vector type. 12903 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 12904 // Check if the type is legal. 12905 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12906 if (!TLI.isTypeLegal(VecVT)) 12907 return SDValue(); 12908 12909 // Combine: 12910 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 12911 // => BITCAST INSERT_VECTOR_ELT 12912 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 12913 // (BITCAST EN), N. 12914 SDValue Vec = DAG.getUNDEF(VecVT); 12915 SDLoc dl(N); 12916 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 12917 SDValue V = N->getOperand(Idx); 12918 if (V.isUndef()) 12919 continue; 12920 if (V.getOpcode() == ISD::BITCAST && 12921 V->getOperand(0).getValueType() == MVT::i32) 12922 // Fold obvious case. 12923 V = V.getOperand(0); 12924 else { 12925 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 12926 // Make the DAGCombiner fold the bitcasts. 12927 DCI.AddToWorklist(V.getNode()); 12928 } 12929 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 12930 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 12931 } 12932 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 12933 // Make the DAGCombiner fold the bitcasts. 
12934 DCI.AddToWorklist(Vec.getNode()); 12935 return Vec; 12936 } 12937 12938 static SDValue 12939 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12940 EVT VT = N->getValueType(0); 12941 SDValue Op = N->getOperand(0); 12942 SDLoc dl(N); 12943 12944 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 12945 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 12946 // If the valuetypes are the same, we can remove the cast entirely. 12947 if (Op->getOperand(0).getValueType() == VT) 12948 return Op->getOperand(0); 12949 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, 12950 Op->getOperand(0).getValueType(), Op->getOperand(0)); 12951 } 12952 12953 return SDValue(); 12954 } 12955 12956 static SDValue PerformVCMPCombine(SDNode *N, 12957 TargetLowering::DAGCombinerInfo &DCI, 12958 const ARMSubtarget *Subtarget) { 12959 if (!Subtarget->hasMVEIntegerOps()) 12960 return SDValue(); 12961 12962 EVT VT = N->getValueType(0); 12963 SDValue Op0 = N->getOperand(0); 12964 SDValue Op1 = N->getOperand(1); 12965 ARMCC::CondCodes Cond = 12966 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12967 SDLoc dl(N); 12968 12969 // vcmp X, 0, cc -> vcmpz X, cc 12970 if (isZeroVector(Op1)) 12971 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, 12972 N->getOperand(2)); 12973 12974 unsigned SwappedCond = getSwappedCondition(Cond); 12975 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 12976 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 12977 if (isZeroVector(Op0)) 12978 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 12979 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12980 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 12981 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 12982 return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 12983 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12984 } 12985 12986 return SDValue(); 12987 } 12988 12989 /// PerformInsertEltCombine - Target-specific 
dag combine xforms for 12990 /// ISD::INSERT_VECTOR_ELT. 12991 static SDValue PerformInsertEltCombine(SDNode *N, 12992 TargetLowering::DAGCombinerInfo &DCI) { 12993 // Bitcast an i64 load inserted into a vector to f64. 12994 // Otherwise, the i64 value will be legalized to a pair of i32 values. 12995 EVT VT = N->getValueType(0); 12996 SDNode *Elt = N->getOperand(1).getNode(); 12997 if (VT.getVectorElementType() != MVT::i64 || 12998 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 12999 return SDValue(); 13000 13001 SelectionDAG &DAG = DCI.DAG; 13002 SDLoc dl(N); 13003 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13004 VT.getVectorNumElements()); 13005 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 13006 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 13007 // Make the DAGCombiner fold the bitcasts. 13008 DCI.AddToWorklist(Vec.getNode()); 13009 DCI.AddToWorklist(V.getNode()); 13010 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 13011 Vec, V, N->getOperand(2)); 13012 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 13013 } 13014 13015 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 13016 /// ISD::VECTOR_SHUFFLE. 13017 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 13018 // The LLVM shufflevector instruction does not require the shuffle mask 13019 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 13020 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 13021 // operands do not match the mask length, they are extended by concatenating 13022 // them with undef vectors. That is probably the right thing for other 13023 // targets, but for NEON it is better to concatenate two double-register 13024 // size vector operands into a single quad-register size vector. 
Do that 13025 // transformation here: 13026 // shuffle(concat(v1, undef), concat(v2, undef)) -> 13027 // shuffle(concat(v1, v2), undef) 13028 SDValue Op0 = N->getOperand(0); 13029 SDValue Op1 = N->getOperand(1); 13030 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 13031 Op1.getOpcode() != ISD::CONCAT_VECTORS || 13032 Op0.getNumOperands() != 2 || 13033 Op1.getNumOperands() != 2) 13034 return SDValue(); 13035 SDValue Concat0Op1 = Op0.getOperand(1); 13036 SDValue Concat1Op1 = Op1.getOperand(1); 13037 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 13038 return SDValue(); 13039 // Skip the transformation if any of the types are illegal. 13040 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13041 EVT VT = N->getValueType(0); 13042 if (!TLI.isTypeLegal(VT) || 13043 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 13044 !TLI.isTypeLegal(Concat1Op1.getValueType())) 13045 return SDValue(); 13046 13047 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 13048 Op0.getOperand(0), Op1.getOperand(0)); 13049 // Translate the shuffle mask. 13050 SmallVector<int, 16> NewMask; 13051 unsigned NumElts = VT.getVectorNumElements(); 13052 unsigned HalfElts = NumElts/2; 13053 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 13054 for (unsigned n = 0; n < NumElts; ++n) { 13055 int MaskElt = SVN->getMaskElt(n); 13056 int NewElt = -1; 13057 if (MaskElt < (int)HalfElts) 13058 NewElt = MaskElt; 13059 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 13060 NewElt = HalfElts + MaskElt - NumElts; 13061 NewMask.push_back(NewElt); 13062 } 13063 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 13064 DAG.getUNDEF(VT), NewMask); 13065 } 13066 13067 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 13068 /// NEON load/store intrinsics, and generic vector load/stores, to merge 13069 /// base address updates. 13070 /// For generic load/stores, the memory type is assumed to be a vector. 
13071 /// The caller is assumed to have checked legality. 13072 static SDValue CombineBaseUpdate(SDNode *N, 13073 TargetLowering::DAGCombinerInfo &DCI) { 13074 SelectionDAG &DAG = DCI.DAG; 13075 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 13076 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 13077 const bool isStore = N->getOpcode() == ISD::STORE; 13078 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 13079 SDValue Addr = N->getOperand(AddrOpIdx); 13080 MemSDNode *MemN = cast<MemSDNode>(N); 13081 SDLoc dl(N); 13082 13083 // Search for a use of the address operand that is an increment. 13084 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 13085 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 13086 SDNode *User = *UI; 13087 if (User->getOpcode() != ISD::ADD || 13088 UI.getUse().getResNo() != Addr.getResNo()) 13089 continue; 13090 13091 // Check that the add is independent of the load/store. Otherwise, folding 13092 // it would create a cycle. We can avoid searching through Addr as it's a 13093 // predecessor to both. 13094 SmallPtrSet<const SDNode *, 32> Visited; 13095 SmallVector<const SDNode *, 16> Worklist; 13096 Visited.insert(Addr.getNode()); 13097 Worklist.push_back(N); 13098 Worklist.push_back(User); 13099 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 13100 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 13101 continue; 13102 13103 // Find the new opcode for the updating load/store. 
13104 bool isLoadOp = true; 13105 bool isLaneOp = false; 13106 unsigned NewOpc = 0; 13107 unsigned NumVecs = 0; 13108 if (isIntrinsic) { 13109 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 13110 switch (IntNo) { 13111 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 13112 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 13113 NumVecs = 1; break; 13114 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 13115 NumVecs = 2; break; 13116 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 13117 NumVecs = 3; break; 13118 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 13119 NumVecs = 4; break; 13120 case Intrinsic::arm_neon_vld2dup: 13121 case Intrinsic::arm_neon_vld3dup: 13122 case Intrinsic::arm_neon_vld4dup: 13123 // TODO: Support updating VLDxDUP nodes. For now, we just skip 13124 // combining base updates for such intrinsics. 13125 continue; 13126 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 13127 NumVecs = 2; isLaneOp = true; break; 13128 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 13129 NumVecs = 3; isLaneOp = true; break; 13130 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 13131 NumVecs = 4; isLaneOp = true; break; 13132 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 13133 NumVecs = 1; isLoadOp = false; break; 13134 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 13135 NumVecs = 2; isLoadOp = false; break; 13136 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 13137 NumVecs = 3; isLoadOp = false; break; 13138 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 13139 NumVecs = 4; isLoadOp = false; break; 13140 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 13141 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 13142 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 13143 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 13144 case 
Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 13145 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 13146 } 13147 } else { 13148 isLaneOp = true; 13149 switch (N->getOpcode()) { 13150 default: llvm_unreachable("unexpected opcode for Neon base update"); 13151 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 13152 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 13153 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 13154 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 13155 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 13156 NumVecs = 1; isLaneOp = false; break; 13157 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 13158 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 13159 } 13160 } 13161 13162 // Find the size of memory referenced by the load/store. 13163 EVT VecTy; 13164 if (isLoadOp) { 13165 VecTy = N->getValueType(0); 13166 } else if (isIntrinsic) { 13167 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 13168 } else { 13169 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 13170 VecTy = N->getOperand(1).getValueType(); 13171 } 13172 13173 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 13174 if (isLaneOp) 13175 NumBytes /= VecTy.getVectorNumElements(); 13176 13177 // If the increment is a constant, it must match the memory ref size. 13178 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 13179 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 13180 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 13181 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 13182 // separate instructions that make it harder to use a non-constant update. 13183 continue; 13184 } 13185 13186 // OK, we found an ADD we can fold into the base update. 13187 // Now, create a _UPD node, taking care of not breaking alignment. 
13188 13189 EVT AlignedVecTy = VecTy; 13190 unsigned Alignment = MemN->getAlignment(); 13191 13192 // If this is a less-than-standard-aligned load/store, change the type to 13193 // match the standard alignment. 13194 // The alignment is overlooked when selecting _UPD variants; and it's 13195 // easier to introduce bitcasts here than fix that. 13196 // There are 3 ways to get to this base-update combine: 13197 // - intrinsics: they are assumed to be properly aligned (to the standard 13198 // alignment of the memory type), so we don't need to do anything. 13199 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 13200 // intrinsics, so, likewise, there's nothing to do. 13201 // - generic load/store instructions: the alignment is specified as an 13202 // explicit operand, rather than implicitly as the standard alignment 13203 // of the memory type (like the intrisics). We need to change the 13204 // memory type to match the explicit alignment. That way, we don't 13205 // generate non-standard-aligned ARMISD::VLDx nodes. 13206 if (isa<LSBaseSDNode>(N)) { 13207 if (Alignment == 0) 13208 Alignment = 1; 13209 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 13210 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 13211 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 13212 assert(!isLaneOp && "Unexpected generic load/store lane."); 13213 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 13214 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 13215 } 13216 // Don't set an explicit alignment on regular load/stores that we want 13217 // to transform to VLD/VST 1_UPD nodes. 13218 // This matches the behavior of regular load/stores, which only get an 13219 // explicit alignment if the MMO alignment is larger than the standard 13220 // alignment of the memory type. 13221 // Intrinsics, however, always get an explicit alignment, set to the 13222 // alignment of the MMO. 
13223 Alignment = 1; 13224 } 13225 13226 // Create the new updating load/store node. 13227 // First, create an SDVTList for the new updating node's results. 13228 EVT Tys[6]; 13229 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 13230 unsigned n; 13231 for (n = 0; n < NumResultVecs; ++n) 13232 Tys[n] = AlignedVecTy; 13233 Tys[n++] = MVT::i32; 13234 Tys[n] = MVT::Other; 13235 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 13236 13237 // Then, gather the new node's operands. 13238 SmallVector<SDValue, 8> Ops; 13239 Ops.push_back(N->getOperand(0)); // incoming chain 13240 Ops.push_back(N->getOperand(AddrOpIdx)); 13241 Ops.push_back(Inc); 13242 13243 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 13244 // Try to match the intrinsic's signature 13245 Ops.push_back(StN->getValue()); 13246 } else { 13247 // Loads (and of course intrinsics) match the intrinsics' signature, 13248 // so just add all but the alignment operand. 13249 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 13250 Ops.push_back(N->getOperand(i)); 13251 } 13252 13253 // For all node types, the alignment operand is always the last one. 13254 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 13255 13256 // If this is a non-standard-aligned STORE, the penultimate operand is the 13257 // stored value. Bitcast it to the aligned type. 13258 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 13259 SDValue &StVal = Ops[Ops.size()-2]; 13260 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 13261 } 13262 13263 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 13264 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 13265 MemN->getMemOperand()); 13266 13267 // Update the uses. 
13268 SmallVector<SDValue, 5> NewResults; 13269 for (unsigned i = 0; i < NumResultVecs; ++i) 13270 NewResults.push_back(SDValue(UpdN.getNode(), i)); 13271 13272 // If this is an non-standard-aligned LOAD, the first result is the loaded 13273 // value. Bitcast it to the expected result type. 13274 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 13275 SDValue &LdVal = NewResults[0]; 13276 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 13277 } 13278 13279 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 13280 DCI.CombineTo(N, NewResults); 13281 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 13282 13283 break; 13284 } 13285 return SDValue(); 13286 } 13287 13288 static SDValue PerformVLDCombine(SDNode *N, 13289 TargetLowering::DAGCombinerInfo &DCI) { 13290 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13291 return SDValue(); 13292 13293 return CombineBaseUpdate(N, DCI); 13294 } 13295 13296 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 13297 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 13298 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 13299 /// return true. 13300 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 13301 SelectionDAG &DAG = DCI.DAG; 13302 EVT VT = N->getValueType(0); 13303 // vldN-dup instructions only support 64-bit vectors for N > 1. 13304 if (!VT.is64BitVector()) 13305 return false; 13306 13307 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 
13308 SDNode *VLD = N->getOperand(0).getNode(); 13309 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 13310 return false; 13311 unsigned NumVecs = 0; 13312 unsigned NewOpc = 0; 13313 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 13314 if (IntNo == Intrinsic::arm_neon_vld2lane) { 13315 NumVecs = 2; 13316 NewOpc = ARMISD::VLD2DUP; 13317 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 13318 NumVecs = 3; 13319 NewOpc = ARMISD::VLD3DUP; 13320 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 13321 NumVecs = 4; 13322 NewOpc = ARMISD::VLD4DUP; 13323 } else { 13324 return false; 13325 } 13326 13327 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 13328 // numbers match the load. 13329 unsigned VLDLaneNo = 13330 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 13331 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13332 UI != UE; ++UI) { 13333 // Ignore uses of the chain result. 13334 if (UI.getUse().getResNo() == NumVecs) 13335 continue; 13336 SDNode *User = *UI; 13337 if (User->getOpcode() != ARMISD::VDUPLANE || 13338 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 13339 return false; 13340 } 13341 13342 // Create the vldN-dup node. 13343 EVT Tys[5]; 13344 unsigned n; 13345 for (n = 0; n < NumVecs; ++n) 13346 Tys[n] = VT; 13347 Tys[n] = MVT::Other; 13348 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 13349 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 13350 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 13351 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 13352 Ops, VLDMemInt->getMemoryVT(), 13353 VLDMemInt->getMemOperand()); 13354 13355 // Update the uses. 13356 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13357 UI != UE; ++UI) { 13358 unsigned ResNo = UI.getUse().getResNo(); 13359 // Ignore uses of the chain result. 
13360 if (ResNo == NumVecs) 13361 continue; 13362 SDNode *User = *UI; 13363 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 13364 } 13365 13366 // Now the vldN-lane intrinsic is dead except for its chain result. 13367 // Update uses of the chain. 13368 std::vector<SDValue> VLDDupResults; 13369 for (unsigned n = 0; n < NumVecs; ++n) 13370 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 13371 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 13372 DCI.CombineTo(VLD, VLDDupResults); 13373 13374 return true; 13375 } 13376 13377 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 13378 /// ARMISD::VDUPLANE. 13379 static SDValue PerformVDUPLANECombine(SDNode *N, 13380 TargetLowering::DAGCombinerInfo &DCI) { 13381 SDValue Op = N->getOperand(0); 13382 13383 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 13384 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 13385 if (CombineVLDDUP(N, DCI)) 13386 return SDValue(N, 0); 13387 13388 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 13389 // redundant. Ignore bit_converts for now; element sizes are checked below. 13390 while (Op.getOpcode() == ISD::BITCAST) 13391 Op = Op.getOperand(0); 13392 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 13393 return SDValue(); 13394 13395 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 13396 unsigned EltSize = Op.getScalarValueSizeInBits(); 13397 // The canonical VMOV for a zero vector uses a 32-bit element size. 
13398 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13399 unsigned EltBits; 13400 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 13401 EltSize = 8; 13402 EVT VT = N->getValueType(0); 13403 if (EltSize > VT.getScalarSizeInBits()) 13404 return SDValue(); 13405 13406 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 13407 } 13408 13409 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 13410 static SDValue PerformVDUPCombine(SDNode *N, 13411 TargetLowering::DAGCombinerInfo &DCI, 13412 const ARMSubtarget *Subtarget) { 13413 SelectionDAG &DAG = DCI.DAG; 13414 SDValue Op = N->getOperand(0); 13415 13416 if (!Subtarget->hasNEON()) 13417 return SDValue(); 13418 13419 // Match VDUP(LOAD) -> VLD1DUP. 13420 // We match this pattern here rather than waiting for isel because the 13421 // transform is only legal for unindexed loads. 13422 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 13423 if (LD && Op.hasOneUse() && LD->isUnindexed() && 13424 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 13425 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 13426 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 13427 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 13428 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 13429 Ops, LD->getMemoryVT(), 13430 LD->getMemOperand()); 13431 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 13432 return VLDDup; 13433 } 13434 13435 return SDValue(); 13436 } 13437 13438 static SDValue PerformLOADCombine(SDNode *N, 13439 TargetLowering::DAGCombinerInfo &DCI) { 13440 EVT VT = N->getValueType(0); 13441 13442 // If this is a legal vector load, try to combine it into a VLD1_UPD. 
13443 if (ISD::isNormalLoad(N) && VT.isVector() && 13444 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13445 return CombineBaseUpdate(N, DCI); 13446 13447 return SDValue(); 13448 } 13449 13450 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 13451 // pack all of the elements in one place. Next, store to memory in fewer 13452 // chunks. 13453 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 13454 SelectionDAG &DAG) { 13455 SDValue StVal = St->getValue(); 13456 EVT VT = StVal.getValueType(); 13457 if (!St->isTruncatingStore() || !VT.isVector()) 13458 return SDValue(); 13459 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13460 EVT StVT = St->getMemoryVT(); 13461 unsigned NumElems = VT.getVectorNumElements(); 13462 assert(StVT != VT && "Cannot truncate to the same type"); 13463 unsigned FromEltSz = VT.getScalarSizeInBits(); 13464 unsigned ToEltSz = StVT.getScalarSizeInBits(); 13465 13466 // From, To sizes and ElemCount must be pow of two 13467 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 13468 return SDValue(); 13469 13470 // We are going to use the original vector elt for storing. 13471 // Accumulated smaller vector elements must be a multiple of the store size. 13472 if (0 != (NumElems * FromEltSz) % ToEltSz) 13473 return SDValue(); 13474 13475 unsigned SizeRatio = FromEltSz / ToEltSz; 13476 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 13477 13478 // Create a type on which we perform the shuffle. 13479 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 13480 NumElems * SizeRatio); 13481 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 13482 13483 SDLoc DL(St); 13484 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 13485 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 13486 for (unsigned i = 0; i < NumElems; ++i) 13487 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? 
(i + 1) * SizeRatio - 1 13488 : i * SizeRatio; 13489 13490 // Can't shuffle using an illegal type. 13491 if (!TLI.isTypeLegal(WideVecVT)) 13492 return SDValue(); 13493 13494 SDValue Shuff = DAG.getVectorShuffle( 13495 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 13496 // At this point all of the data is stored at the bottom of the 13497 // register. We now need to save it to mem. 13498 13499 // Find the largest store unit 13500 MVT StoreType = MVT::i8; 13501 for (MVT Tp : MVT::integer_valuetypes()) { 13502 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 13503 StoreType = Tp; 13504 } 13505 // Didn't find a legal store type. 13506 if (!TLI.isTypeLegal(StoreType)) 13507 return SDValue(); 13508 13509 // Bitcast the original vector into a vector of store-size units 13510 EVT StoreVecVT = 13511 EVT::getVectorVT(*DAG.getContext(), StoreType, 13512 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 13513 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 13514 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 13515 SmallVector<SDValue, 8> Chains; 13516 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 13517 TLI.getPointerTy(DAG.getDataLayout())); 13518 SDValue BasePtr = St->getBasePtr(); 13519 13520 // Perform one or more big stores into memory. 
13521 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); 13522 for (unsigned I = 0; I < E; I++) { 13523 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, 13524 ShuffWide, DAG.getIntPtrConstant(I, DL)); 13525 SDValue Ch = 13526 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), 13527 St->getAlignment(), St->getMemOperand()->getFlags()); 13528 BasePtr = 13529 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); 13530 Chains.push_back(Ch); 13531 } 13532 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 13533 } 13534 13535 // Try taking a single vector store from an truncate (which would otherwise turn 13536 // into an expensive buildvector) and splitting it into a series of narrowing 13537 // stores. 13538 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 13539 SelectionDAG &DAG) { 13540 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 13541 return SDValue(); 13542 SDValue Trunc = St->getValue(); 13543 if (Trunc->getOpcode() != ISD::TRUNCATE) 13544 return SDValue(); 13545 EVT FromVT = Trunc->getOperand(0).getValueType(); 13546 EVT ToVT = Trunc.getValueType(); 13547 if (!ToVT.isVector()) 13548 return SDValue(); 13549 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 13550 EVT ToEltVT = ToVT.getVectorElementType(); 13551 EVT FromEltVT = FromVT.getVectorElementType(); 13552 13553 unsigned NumElements = 0; 13554 if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) 13555 NumElements = 4; 13556 if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) 13557 NumElements = 8; 13558 if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || 13559 FromVT.getVectorNumElements() % NumElements != 0) 13560 return SDValue(); 13561 13562 SDLoc DL(St); 13563 // Details about the old store 13564 SDValue Ch = St->getChain(); 13565 SDValue BasePtr = St->getBasePtr(); 13566 unsigned Alignment = St->getOriginalAlignment(); 13567 
MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 13568 AAMDNodes AAInfo = St->getAAInfo(); 13569 13570 EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); 13571 EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); 13572 13573 SmallVector<SDValue, 4> Stores; 13574 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 13575 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 13576 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 13577 13578 SDValue Extract = 13579 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 13580 DAG.getConstant(i * NumElements, DL, MVT::i32)); 13581 SDValue Store = DAG.getTruncStore( 13582 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 13583 NewToVT, Alignment, MMOFlags, AAInfo); 13584 Stores.push_back(Store); 13585 } 13586 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 13587 } 13588 13589 /// PerformSTORECombine - Target-specific dag combine xforms for 13590 /// ISD::STORE. 13591 static SDValue PerformSTORECombine(SDNode *N, 13592 TargetLowering::DAGCombinerInfo &DCI, 13593 const ARMSubtarget *Subtarget) { 13594 StoreSDNode *St = cast<StoreSDNode>(N); 13595 if (St->isVolatile()) 13596 return SDValue(); 13597 SDValue StVal = St->getValue(); 13598 EVT VT = StVal.getValueType(); 13599 13600 if (Subtarget->hasNEON()) 13601 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 13602 return Store; 13603 13604 if (Subtarget->hasMVEIntegerOps()) 13605 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 13606 return NewToken; 13607 13608 if (!ISD::isNormalStore(St)) 13609 return SDValue(); 13610 13611 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 13612 // ARM stores of arguments in the same cache line. 
13613 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 13614 StVal.getNode()->hasOneUse()) { 13615 SelectionDAG &DAG = DCI.DAG; 13616 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 13617 SDLoc DL(St); 13618 SDValue BasePtr = St->getBasePtr(); 13619 SDValue NewST1 = DAG.getStore( 13620 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 13621 BasePtr, St->getPointerInfo(), St->getAlignment(), 13622 St->getMemOperand()->getFlags()); 13623 13624 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 13625 DAG.getConstant(4, DL, MVT::i32)); 13626 return DAG.getStore(NewST1.getValue(0), DL, 13627 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 13628 OffsetPtr, St->getPointerInfo(), 13629 std::min(4U, St->getAlignment() / 2), 13630 St->getMemOperand()->getFlags()); 13631 } 13632 13633 if (StVal.getValueType() == MVT::i64 && 13634 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 13635 13636 // Bitcast an i64 store extracted from a vector to f64. 13637 // Otherwise, the i64 value will be legalized to a pair of i32 values. 13638 SelectionDAG &DAG = DCI.DAG; 13639 SDLoc dl(StVal); 13640 SDValue IntVec = StVal.getOperand(0); 13641 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13642 IntVec.getValueType().getVectorNumElements()); 13643 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 13644 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 13645 Vec, StVal.getOperand(1)); 13646 dl = SDLoc(N); 13647 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 13648 // Make the DAGCombiner fold the bitcasts. 13649 DCI.AddToWorklist(Vec.getNode()); 13650 DCI.AddToWorklist(ExtElt.getNode()); 13651 DCI.AddToWorklist(V.getNode()); 13652 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 13653 St->getPointerInfo(), St->getAlignment(), 13654 St->getMemOperand()->getFlags(), St->getAAInfo()); 13655 } 13656 13657 // If this is a legal vector store, try to combine it into a VST1_UPD. 
13658 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 13659 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13660 return CombineBaseUpdate(N, DCI); 13661 13662 return SDValue(); 13663 } 13664 13665 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 13666 /// can replace combinations of VMUL and VCVT (floating-point to integer) 13667 /// when the VMUL has a constant operand that is a power of 2. 13668 /// 13669 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 13670 /// vmul.f32 d16, d17, d16 13671 /// vcvt.s32.f32 d16, d16 13672 /// becomes: 13673 /// vcvt.s32.f32 d16, d16, #3 13674 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 13675 const ARMSubtarget *Subtarget) { 13676 if (!Subtarget->hasNEON()) 13677 return SDValue(); 13678 13679 SDValue Op = N->getOperand(0); 13680 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 13681 Op.getOpcode() != ISD::FMUL) 13682 return SDValue(); 13683 13684 SDValue ConstVec = Op->getOperand(1); 13685 if (!isa<BuildVectorSDNode>(ConstVec)) 13686 return SDValue(); 13687 13688 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 13689 uint32_t FloatBits = FloatTy.getSizeInBits(); 13690 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 13691 uint32_t IntBits = IntTy.getSizeInBits(); 13692 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 13693 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 13694 // These instructions only exist converting from f32 to i32. We can handle 13695 // smaller integers by generating an extra truncate, but larger ones would 13696 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 13697 // these intructions only support v2i32/v4i32 types. 
13698 return SDValue(); 13699 } 13700 13701 BitVector UndefElements; 13702 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 13703 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 13704 if (C == -1 || C == 0 || C > 32) 13705 return SDValue(); 13706 13707 SDLoc dl(N); 13708 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 13709 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 13710 Intrinsic::arm_neon_vcvtfp2fxu; 13711 SDValue FixConv = DAG.getNode( 13712 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 13713 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 13714 DAG.getConstant(C, dl, MVT::i32)); 13715 13716 if (IntBits < FloatBits) 13717 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 13718 13719 return FixConv; 13720 } 13721 13722 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 13723 /// can replace combinations of VCVT (integer to floating-point) and VDIV 13724 /// when the VDIV has a constant operand that is a power of 2. 
13725 /// 13726 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 13727 /// vcvt.f32.s32 d16, d16 13728 /// vdiv.f32 d16, d17, d16 13729 /// becomes: 13730 /// vcvt.f32.s32 d16, d16, #3 13731 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 13732 const ARMSubtarget *Subtarget) { 13733 if (!Subtarget->hasNEON()) 13734 return SDValue(); 13735 13736 SDValue Op = N->getOperand(0); 13737 unsigned OpOpcode = Op.getNode()->getOpcode(); 13738 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 13739 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 13740 return SDValue(); 13741 13742 SDValue ConstVec = N->getOperand(1); 13743 if (!isa<BuildVectorSDNode>(ConstVec)) 13744 return SDValue(); 13745 13746 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 13747 uint32_t FloatBits = FloatTy.getSizeInBits(); 13748 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 13749 uint32_t IntBits = IntTy.getSizeInBits(); 13750 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 13751 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 13752 // These instructions only exist converting from i32 to f32. We can handle 13753 // smaller integers by generating an extra extend, but larger ones would 13754 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 13755 // these intructions only support v2i32/v4i32 types. 13756 return SDValue(); 13757 } 13758 13759 BitVector UndefElements; 13760 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 13761 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 13762 if (C == -1 || C == 0 || C > 32) 13763 return SDValue(); 13764 13765 SDLoc dl(N); 13766 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 13767 SDValue ConvInput = Op.getOperand(0); 13768 if (IntBits < FloatBits) 13769 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 13770 dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, 13771 ConvInput); 13772 13773 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 13774 Intrinsic::arm_neon_vcvtfxu2fp; 13775 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 13776 Op.getValueType(), 13777 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 13778 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 13779 } 13780 13781 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 13782 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 13783 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 13784 switch (IntNo) { 13785 default: 13786 // Don't do anything for most intrinsics. 13787 break; 13788 13789 // Vector shifts: check for immediate versions and lower them. 13790 // Note: This is done during DAG combining instead of DAG legalizing because 13791 // the build_vectors for 64-bit vector element shift counts are generally 13792 // not legal, and it is hard to see their values after they get legalized to 13793 // loads from a constant pool. 
13794 case Intrinsic::arm_neon_vshifts: 13795 case Intrinsic::arm_neon_vshiftu: 13796 case Intrinsic::arm_neon_vrshifts: 13797 case Intrinsic::arm_neon_vrshiftu: 13798 case Intrinsic::arm_neon_vrshiftn: 13799 case Intrinsic::arm_neon_vqshifts: 13800 case Intrinsic::arm_neon_vqshiftu: 13801 case Intrinsic::arm_neon_vqshiftsu: 13802 case Intrinsic::arm_neon_vqshiftns: 13803 case Intrinsic::arm_neon_vqshiftnu: 13804 case Intrinsic::arm_neon_vqshiftnsu: 13805 case Intrinsic::arm_neon_vqrshiftns: 13806 case Intrinsic::arm_neon_vqrshiftnu: 13807 case Intrinsic::arm_neon_vqrshiftnsu: { 13808 EVT VT = N->getOperand(1).getValueType(); 13809 int64_t Cnt; 13810 unsigned VShiftOpc = 0; 13811 13812 switch (IntNo) { 13813 case Intrinsic::arm_neon_vshifts: 13814 case Intrinsic::arm_neon_vshiftu: 13815 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 13816 VShiftOpc = ARMISD::VSHLIMM; 13817 break; 13818 } 13819 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 13820 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
ARMISD::VSHRsIMM 13821 : ARMISD::VSHRuIMM); 13822 break; 13823 } 13824 return SDValue(); 13825 13826 case Intrinsic::arm_neon_vrshifts: 13827 case Intrinsic::arm_neon_vrshiftu: 13828 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 13829 break; 13830 return SDValue(); 13831 13832 case Intrinsic::arm_neon_vqshifts: 13833 case Intrinsic::arm_neon_vqshiftu: 13834 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13835 break; 13836 return SDValue(); 13837 13838 case Intrinsic::arm_neon_vqshiftsu: 13839 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13840 break; 13841 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 13842 13843 case Intrinsic::arm_neon_vrshiftn: 13844 case Intrinsic::arm_neon_vqshiftns: 13845 case Intrinsic::arm_neon_vqshiftnu: 13846 case Intrinsic::arm_neon_vqshiftnsu: 13847 case Intrinsic::arm_neon_vqrshiftns: 13848 case Intrinsic::arm_neon_vqrshiftnu: 13849 case Intrinsic::arm_neon_vqrshiftnsu: 13850 // Narrowing shifts require an immediate right shift. 13851 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 13852 break; 13853 llvm_unreachable("invalid shift count for narrowing vector shift " 13854 "intrinsic"); 13855 13856 default: 13857 llvm_unreachable("unhandled vector shift"); 13858 } 13859 13860 switch (IntNo) { 13861 case Intrinsic::arm_neon_vshifts: 13862 case Intrinsic::arm_neon_vshiftu: 13863 // Opcode already set above. 
13864 break; 13865 case Intrinsic::arm_neon_vrshifts: 13866 VShiftOpc = ARMISD::VRSHRsIMM; 13867 break; 13868 case Intrinsic::arm_neon_vrshiftu: 13869 VShiftOpc = ARMISD::VRSHRuIMM; 13870 break; 13871 case Intrinsic::arm_neon_vrshiftn: 13872 VShiftOpc = ARMISD::VRSHRNIMM; 13873 break; 13874 case Intrinsic::arm_neon_vqshifts: 13875 VShiftOpc = ARMISD::VQSHLsIMM; 13876 break; 13877 case Intrinsic::arm_neon_vqshiftu: 13878 VShiftOpc = ARMISD::VQSHLuIMM; 13879 break; 13880 case Intrinsic::arm_neon_vqshiftsu: 13881 VShiftOpc = ARMISD::VQSHLsuIMM; 13882 break; 13883 case Intrinsic::arm_neon_vqshiftns: 13884 VShiftOpc = ARMISD::VQSHRNsIMM; 13885 break; 13886 case Intrinsic::arm_neon_vqshiftnu: 13887 VShiftOpc = ARMISD::VQSHRNuIMM; 13888 break; 13889 case Intrinsic::arm_neon_vqshiftnsu: 13890 VShiftOpc = ARMISD::VQSHRNsuIMM; 13891 break; 13892 case Intrinsic::arm_neon_vqrshiftns: 13893 VShiftOpc = ARMISD::VQRSHRNsIMM; 13894 break; 13895 case Intrinsic::arm_neon_vqrshiftnu: 13896 VShiftOpc = ARMISD::VQRSHRNuIMM; 13897 break; 13898 case Intrinsic::arm_neon_vqrshiftnsu: 13899 VShiftOpc = ARMISD::VQRSHRNsuIMM; 13900 break; 13901 } 13902 13903 SDLoc dl(N); 13904 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13905 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 13906 } 13907 13908 case Intrinsic::arm_neon_vshiftins: { 13909 EVT VT = N->getOperand(1).getValueType(); 13910 int64_t Cnt; 13911 unsigned VShiftOpc = 0; 13912 13913 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 13914 VShiftOpc = ARMISD::VSLIIMM; 13915 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 13916 VShiftOpc = ARMISD::VSRIIMM; 13917 else { 13918 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 13919 } 13920 13921 SDLoc dl(N); 13922 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13923 N->getOperand(1), N->getOperand(2), 13924 DAG.getConstant(Cnt, dl, MVT::i32)); 13925 } 13926 13927 case Intrinsic::arm_neon_vqrshifts: 13928 case 
Intrinsic::arm_neon_vqrshiftu: 13929 // No immediate versions of these to check for. 13930 break; 13931 } 13932 13933 return SDValue(); 13934 } 13935 13936 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 13937 /// lowers them. As with the vector shift intrinsics, this is done during DAG 13938 /// combining instead of DAG legalizing because the build_vectors for 64-bit 13939 /// vector element shift counts are generally not legal, and it is hard to see 13940 /// their values after they get legalized to loads from a constant pool. 13941 static SDValue PerformShiftCombine(SDNode *N, 13942 TargetLowering::DAGCombinerInfo &DCI, 13943 const ARMSubtarget *ST) { 13944 SelectionDAG &DAG = DCI.DAG; 13945 EVT VT = N->getValueType(0); 13946 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 13947 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 13948 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 13949 SDValue N1 = N->getOperand(1); 13950 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 13951 SDValue N0 = N->getOperand(0); 13952 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 13953 DAG.MaskedValueIsZero(N0.getOperand(0), 13954 APInt::getHighBitsSet(32, 16))) 13955 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 13956 } 13957 } 13958 13959 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 13960 N->getOperand(0)->getOpcode() == ISD::AND && 13961 N->getOperand(0)->hasOneUse()) { 13962 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13963 return SDValue(); 13964 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 13965 // usually show up because instcombine prefers to canonicalize it to 13966 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 13967 // out of GEP lowering in some cases. 
13968 SDValue N0 = N->getOperand(0); 13969 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13970 if (!ShiftAmtNode) 13971 return SDValue(); 13972 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 13973 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13974 if (!AndMaskNode) 13975 return SDValue(); 13976 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 13977 // Don't transform uxtb/uxth. 13978 if (AndMask == 255 || AndMask == 65535) 13979 return SDValue(); 13980 if (isMask_32(AndMask)) { 13981 uint32_t MaskedBits = countLeadingZeros(AndMask); 13982 if (MaskedBits > ShiftAmt) { 13983 SDLoc DL(N); 13984 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13985 DAG.getConstant(MaskedBits, DL, MVT::i32)); 13986 return DAG.getNode( 13987 ISD::SRL, DL, MVT::i32, SHL, 13988 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 13989 } 13990 } 13991 } 13992 13993 // Nothing to be done for scalar shifts. 13994 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13995 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 13996 return SDValue(); 13997 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 13998 return SDValue(); 13999 14000 int64_t Cnt; 14001 14002 switch (N->getOpcode()) { 14003 default: llvm_unreachable("unexpected shift opcode"); 14004 14005 case ISD::SHL: 14006 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 14007 SDLoc dl(N); 14008 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 14009 DAG.getConstant(Cnt, dl, MVT::i32)); 14010 } 14011 break; 14012 14013 case ISD::SRA: 14014 case ISD::SRL: 14015 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 14016 unsigned VShiftOpc = 14017 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 14018 SDLoc dl(N); 14019 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 14020 DAG.getConstant(Cnt, dl, MVT::i32)); 14021 } 14022 } 14023 return SDValue(); 14024 } 14025 14026 // Look for a sign/zero extend of a larger than legal load. This can be split 14027 // into two extending loads, which are simpler to deal with than an arbitrary 14028 // sign extend. 14029 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 14030 SDValue N0 = N->getOperand(0); 14031 if (N0.getOpcode() != ISD::LOAD) 14032 return SDValue(); 14033 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 14034 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 14035 LD->getExtensionType() != ISD::NON_EXTLOAD) 14036 return SDValue(); 14037 EVT FromVT = LD->getValueType(0); 14038 EVT ToVT = N->getValueType(0); 14039 if (!ToVT.isVector()) 14040 return SDValue(); 14041 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 14042 EVT ToEltVT = ToVT.getVectorElementType(); 14043 EVT FromEltVT = FromVT.getVectorElementType(); 14044 14045 unsigned NumElements = 0; 14046 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 14047 NumElements = 4; 14048 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 14049 NumElements = 8; 14050 if (NumElements == 0 || 14051 FromVT.getVectorNumElements() == NumElements || 14052 FromVT.getVectorNumElements() % NumElements != 0 || 14053 !isPowerOf2_32(NumElements)) 14054 return SDValue(); 14055 14056 SDLoc DL(LD); 14057 // Details about the old load 14058 SDValue Ch = LD->getChain(); 14059 SDValue BasePtr = LD->getBasePtr(); 14060 unsigned Alignment = LD->getOriginalAlignment(); 14061 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 14062 AAMDNodes AAInfo = LD->getAAInfo(); 14063 14064 ISD::LoadExtType NewExtType = 14065 N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 14066 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 14067 EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14068 EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14069 unsigned NewOffset = NewFromVT.getSizeInBits() / 8; 14070 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 14071 14072 // Split the load in half, each side of which is extended separately. This 14073 // is good enough, as legalisation will take it from there. They are either 14074 // already legal or they will be split further into something that is 14075 // legal. 14076 SDValue NewLoad1 = 14077 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, 14078 LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); 14079 SDValue NewLoad2 = 14080 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 14081 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 14082 Alignment, MMOFlags, AAInfo); 14083 14084 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 14085 SDValue(NewLoad1.getNode(), 1), 14086 SDValue(NewLoad2.getNode(), 1)); 14087 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 14088 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); 14089 } 14090 14091 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 14092 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 14093 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 14094 const ARMSubtarget *ST) { 14095 SDValue N0 = N->getOperand(0); 14096 14097 // Check for sign- and zero-extensions of vector extract operations of 8- and 14098 // 16-bit vector elements. NEON and MVE support these directly. They are 14099 // handled during DAG combining because type legalization will promote them 14100 // to 32-bit types and it is messy to recognize the operations after that. 
14101 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 14102 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 14103 SDValue Vec = N0.getOperand(0); 14104 SDValue Lane = N0.getOperand(1); 14105 EVT VT = N->getValueType(0); 14106 EVT EltVT = N0.getValueType(); 14107 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14108 14109 if (VT == MVT::i32 && 14110 (EltVT == MVT::i8 || EltVT == MVT::i16) && 14111 TLI.isTypeLegal(Vec.getValueType()) && 14112 isa<ConstantSDNode>(Lane)) { 14113 14114 unsigned Opc = 0; 14115 switch (N->getOpcode()) { 14116 default: llvm_unreachable("unexpected opcode"); 14117 case ISD::SIGN_EXTEND: 14118 Opc = ARMISD::VGETLANEs; 14119 break; 14120 case ISD::ZERO_EXTEND: 14121 case ISD::ANY_EXTEND: 14122 Opc = ARMISD::VGETLANEu; 14123 break; 14124 } 14125 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 14126 } 14127 } 14128 14129 if (ST->hasMVEIntegerOps()) 14130 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 14131 return NewLoad; 14132 14133 return SDValue(); 14134 } 14135 14136 static const APInt *isPowerOf2Constant(SDValue V) { 14137 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 14138 if (!C) 14139 return nullptr; 14140 const APInt *CV = &C->getAPIntValue(); 14141 return CV->isPowerOf2() ? CV : nullptr; 14142 } 14143 14144 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 14145 // If we have a CMOV, OR and AND combination such as: 14146 // if (x & CN) 14147 // y |= CM; 14148 // 14149 // And: 14150 // * CN is a single bit; 14151 // * All bits covered by CM are known zero in y 14152 // 14153 // Then we can convert this into a sequence of BFI instructions. This will 14154 // always be a win if CM is a single bit, will always be no worse than the 14155 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 14156 // three bits (due to the extra IT instruction). 
14157 14158 SDValue Op0 = CMOV->getOperand(0); 14159 SDValue Op1 = CMOV->getOperand(1); 14160 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 14161 auto CC = CCNode->getAPIntValue().getLimitedValue(); 14162 SDValue CmpZ = CMOV->getOperand(4); 14163 14164 // The compare must be against zero. 14165 if (!isNullConstant(CmpZ->getOperand(1))) 14166 return SDValue(); 14167 14168 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 14169 SDValue And = CmpZ->getOperand(0); 14170 if (And->getOpcode() != ISD::AND) 14171 return SDValue(); 14172 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 14173 if (!AndC) 14174 return SDValue(); 14175 SDValue X = And->getOperand(0); 14176 14177 if (CC == ARMCC::EQ) { 14178 // We're performing an "equal to zero" compare. Swap the operands so we 14179 // canonicalize on a "not equal to zero" compare. 14180 std::swap(Op0, Op1); 14181 } else { 14182 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 14183 } 14184 14185 if (Op1->getOpcode() != ISD::OR) 14186 return SDValue(); 14187 14188 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 14189 if (!OrC) 14190 return SDValue(); 14191 SDValue Y = Op1->getOperand(0); 14192 14193 if (Op0 != Y) 14194 return SDValue(); 14195 14196 // Now, is it profitable to continue? 14197 APInt OrCI = OrC->getAPIntValue(); 14198 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 14199 if (OrCI.countPopulation() > Heuristic) 14200 return SDValue(); 14201 14202 // Lastly, can we determine that the bits defined by OrCI 14203 // are zero in Y? 14204 KnownBits Known = DAG.computeKnownBits(Y); 14205 if ((OrCI & Known.Zero) != OrCI) 14206 return SDValue(); 14207 14208 // OK, we can do the combine. 14209 SDValue V = Y; 14210 SDLoc dl(X); 14211 EVT VT = X.getValueType(); 14212 unsigned BitInX = AndC->logBase2(); 14213 14214 if (BitInX != 0) { 14215 // We must shift X first. 
14216 X = DAG.getNode(ISD::SRL, dl, VT, X, 14217 DAG.getConstant(BitInX, dl, VT)); 14218 } 14219 14220 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 14221 BitInY < NumActiveBits; ++BitInY) { 14222 if (OrCI[BitInY] == 0) 14223 continue; 14224 APInt Mask(VT.getSizeInBits(), 0); 14225 Mask.setBit(BitInY); 14226 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 14227 // Confusingly, the operand is an *inverted* mask. 14228 DAG.getConstant(~Mask, dl, VT)); 14229 } 14230 14231 return V; 14232 } 14233 14234 // Given N, the value controlling the conditional branch, search for the loop 14235 // intrinsic, returning it, along with how the value is used. We need to handle 14236 // patterns such as the following: 14237 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) 14238 // (brcond (setcc (loop.decrement), 0, eq), exit) 14239 // (brcond (setcc (loop.decrement), 0, ne), header) 14240 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, 14241 bool &Negate) { 14242 switch (N->getOpcode()) { 14243 default: 14244 break; 14245 case ISD::XOR: { 14246 if (!isa<ConstantSDNode>(N.getOperand(1))) 14247 return SDValue(); 14248 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) 14249 return SDValue(); 14250 Negate = !Negate; 14251 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); 14252 } 14253 case ISD::SETCC: { 14254 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); 14255 if (!Const) 14256 return SDValue(); 14257 if (Const->isNullValue()) 14258 Imm = 0; 14259 else if (Const->isOne()) 14260 Imm = 1; 14261 else 14262 return SDValue(); 14263 CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); 14264 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); 14265 } 14266 case ISD::INTRINSIC_W_CHAIN: { 14267 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); 14268 if (IntOp != Intrinsic::test_set_loop_iterations && 14269 IntOp != Intrinsic::loop_decrement_reg) 14270 return SDValue(); 14271 return N; 14272 
} 14273 } 14274 return SDValue(); 14275 } 14276 14277 static SDValue PerformHWLoopCombine(SDNode *N, 14278 TargetLowering::DAGCombinerInfo &DCI, 14279 const ARMSubtarget *ST) { 14280 14281 // The hwloop intrinsics that we're interested are used for control-flow, 14282 // either for entering or exiting the loop: 14283 // - test.set.loop.iterations will test whether its operand is zero. If it 14284 // is zero, the proceeding branch should not enter the loop. 14285 // - loop.decrement.reg also tests whether its operand is zero. If it is 14286 // zero, the proceeding branch should not branch back to the beginning of 14287 // the loop. 14288 // So here, we need to check that how the brcond is using the result of each 14289 // of the intrinsics to ensure that we're branching to the right place at the 14290 // right time. 14291 14292 ISD::CondCode CC; 14293 SDValue Cond; 14294 int Imm = 1; 14295 bool Negate = false; 14296 SDValue Chain = N->getOperand(0); 14297 SDValue Dest; 14298 14299 if (N->getOpcode() == ISD::BRCOND) { 14300 CC = ISD::SETEQ; 14301 Cond = N->getOperand(1); 14302 Dest = N->getOperand(2); 14303 } else { 14304 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 14305 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 14306 Cond = N->getOperand(2); 14307 Dest = N->getOperand(4); 14308 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 14309 if (!Const->isOne() && !Const->isNullValue()) 14310 return SDValue(); 14311 Imm = Const->getZExtValue(); 14312 } else 14313 return SDValue(); 14314 } 14315 14316 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 14317 if (!Int) 14318 return SDValue(); 14319 14320 if (Negate) 14321 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 14322 14323 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 14324 return (CC == ISD::SETEQ && Imm == 0) || 14325 (CC == ISD::SETNE && Imm == 1) || 14326 (CC == ISD::SETLT && Imm == 1) || 14327 (CC == ISD::SETULT && Imm == 1); 14328 }; 
14329 14330 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 14331 return (CC == ISD::SETEQ && Imm == 1) || 14332 (CC == ISD::SETNE && Imm == 0) || 14333 (CC == ISD::SETGT && Imm == 0) || 14334 (CC == ISD::SETUGT && Imm == 0) || 14335 (CC == ISD::SETGE && Imm == 1) || 14336 (CC == ISD::SETUGE && Imm == 1); 14337 }; 14338 14339 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 14340 "unsupported condition"); 14341 14342 SDLoc dl(Int); 14343 SelectionDAG &DAG = DCI.DAG; 14344 SDValue Elements = Int.getOperand(2); 14345 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 14346 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 14347 && "expected single br user"); 14348 SDNode *Br = *N->use_begin(); 14349 SDValue OtherTarget = Br->getOperand(1); 14350 14351 // Update the unconditional branch to branch to the given Dest. 14352 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 14353 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 14354 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 14355 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 14356 }; 14357 14358 if (IntOp == Intrinsic::test_set_loop_iterations) { 14359 SDValue Res; 14360 // We expect this 'instruction' to branch when the counter is zero. 14361 if (IsTrueIfZero(CC, Imm)) { 14362 SDValue Ops[] = { Chain, Elements, Dest }; 14363 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14364 } else { 14365 // The logic is the reverse of what we need for WLS, so find the other 14366 // basic block target: the target of the proceeding br. 
14367 UpdateUncondBr(Br, Dest, DAG); 14368 14369 SDValue Ops[] = { Chain, Elements, OtherTarget }; 14370 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14371 } 14372 DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); 14373 return Res; 14374 } else { 14375 SDValue Size = DAG.getTargetConstant( 14376 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 14377 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 14378 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 14379 DAG.getVTList(MVT::i32, MVT::Other), Args); 14380 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 14381 14382 // We expect this instruction to branch when the count is not zero. 14383 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; 14384 14385 // Update the unconditional branch to target the loop preheader if we've 14386 // found the condition has been reversed. 14387 if (Target == OtherTarget) 14388 UpdateUncondBr(Br, Dest, DAG); 14389 14390 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 14391 SDValue(LoopDec.getNode(), 1), Chain); 14392 14393 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 14394 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 14395 } 14396 return SDValue(); 14397 } 14398 14399 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 14400 SDValue 14401 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 14402 SDValue Cmp = N->getOperand(4); 14403 if (Cmp.getOpcode() != ARMISD::CMPZ) 14404 // Only looking at NE cases. 
14405 return SDValue(); 14406 14407 EVT VT = N->getValueType(0); 14408 SDLoc dl(N); 14409 SDValue LHS = Cmp.getOperand(0); 14410 SDValue RHS = Cmp.getOperand(1); 14411 SDValue Chain = N->getOperand(0); 14412 SDValue BB = N->getOperand(1); 14413 SDValue ARMcc = N->getOperand(2); 14414 ARMCC::CondCodes CC = 14415 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14416 14417 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 14418 // -> (brcond Chain BB CC CPSR Cmp) 14419 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 14420 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 14421 LHS->getOperand(0)->hasOneUse()) { 14422 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 14423 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 14424 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14425 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14426 if ((LHS00C && LHS00C->getZExtValue() == 0) && 14427 (LHS01C && LHS01C->getZExtValue() == 1) && 14428 (LHS1C && LHS1C->getZExtValue() == 1) && 14429 (RHSC && RHSC->getZExtValue() == 0)) { 14430 return DAG.getNode( 14431 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 14432 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 14433 } 14434 } 14435 14436 return SDValue(); 14437 } 14438 14439 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 14440 SDValue 14441 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 14442 SDValue Cmp = N->getOperand(4); 14443 if (Cmp.getOpcode() != ARMISD::CMPZ) 14444 // Only looking at EQ and NE cases. 
14445 return SDValue(); 14446 14447 EVT VT = N->getValueType(0); 14448 SDLoc dl(N); 14449 SDValue LHS = Cmp.getOperand(0); 14450 SDValue RHS = Cmp.getOperand(1); 14451 SDValue FalseVal = N->getOperand(0); 14452 SDValue TrueVal = N->getOperand(1); 14453 SDValue ARMcc = N->getOperand(2); 14454 ARMCC::CondCodes CC = 14455 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14456 14457 // BFI is only available on V6T2+. 14458 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 14459 SDValue R = PerformCMOVToBFICombine(N, DAG); 14460 if (R) 14461 return R; 14462 } 14463 14464 // Simplify 14465 // mov r1, r0 14466 // cmp r1, x 14467 // mov r0, y 14468 // moveq r0, x 14469 // to 14470 // cmp r0, x 14471 // movne r0, y 14472 // 14473 // mov r1, r0 14474 // cmp r1, x 14475 // mov r0, x 14476 // movne r0, y 14477 // to 14478 // cmp r0, x 14479 // movne r0, y 14480 /// FIXME: Turn this into a target neutral optimization? 14481 SDValue Res; 14482 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 14483 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 14484 N->getOperand(3), Cmp); 14485 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 14486 SDValue ARMcc; 14487 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 14488 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 14489 N->getOperand(3), NewCmp); 14490 } 14491 14492 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 14493 // -> (cmov F T CC CPSR Cmp) 14494 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 14495 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 14496 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14497 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14498 if ((LHS0C && LHS0C->getZExtValue() == 0) && 14499 (LHS1C && LHS1C->getZExtValue() == 1) && 14500 (RHSC && RHSC->getZExtValue() == 0)) { 14501 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 14502 LHS->getOperand(2), 
LHS->getOperand(3), 14503 LHS->getOperand(4)); 14504 } 14505 } 14506 14507 if (!VT.isInteger()) 14508 return SDValue(); 14509 14510 // Materialize a boolean comparison for integers so we can avoid branching. 14511 if (isNullConstant(FalseVal)) { 14512 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 14513 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 14514 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 14515 // right 5 bits will make that 32 be 1, otherwise it will be 0. 14516 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 14517 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14518 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 14519 DAG.getConstant(5, dl, MVT::i32)); 14520 } else { 14521 // CMOV 0, 1, ==, (CMPZ x, y) -> 14522 // (ADDCARRY (SUB x, y), t:0, t:1) 14523 // where t = (SUBCARRY 0, (SUB x, y), 0) 14524 // 14525 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 14526 // x != y. In other words, a carry C == 1 when x == y, C == 0 14527 // otherwise. 14528 // The final ADDCARRY computes 14529 // x - y + (0 - (x - y)) + C == C 14530 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14531 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14532 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 14533 // ISD::SUBCARRY returns a borrow but we want the carry here 14534 // actually. 14535 SDValue Carry = 14536 DAG.getNode(ISD::SUB, dl, MVT::i32, 14537 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 14538 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 14539 } 14540 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 14541 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 14542 // This seems pointless but will allow us to combine it further below. 
14543 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14544 SDValue Sub = 14545 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14546 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14547 Sub.getValue(1), SDValue()); 14548 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 14549 N->getOperand(3), CPSRGlue.getValue(1)); 14550 FalseVal = Sub; 14551 } 14552 } else if (isNullConstant(TrueVal)) { 14553 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 14554 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 14555 // This seems pointless but will allow us to combine it further below 14556 // Note that we change == for != as this is the dual for the case above. 14557 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14558 SDValue Sub = 14559 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14560 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14561 Sub.getValue(1), SDValue()); 14562 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 14563 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 14564 N->getOperand(3), CPSRGlue.getValue(1)); 14565 FalseVal = Sub; 14566 } 14567 } 14568 14569 // On Thumb1, the DAG above may be further combined if z is a power of 2 14570 // (z == 2 ^ K). 
14571 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 14572 // t1 = (USUBO (SUB x, y), 1) 14573 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 14574 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14575 // 14576 // This also handles the special case of comparing against zero; it's 14577 // essentially, the same pattern, except there's no SUBS: 14578 // CMOV x, z, !=, (CMPZ x, 0) -> 14579 // t1 = (USUBO x, 1) 14580 // t2 = (SUBCARRY x, t1:0, t1:1) 14581 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14582 const APInt *TrueConst; 14583 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 14584 ((FalseVal.getOpcode() == ARMISD::SUBS && 14585 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 14586 (FalseVal == LHS && isNullConstant(RHS))) && 14587 (TrueConst = isPowerOf2Constant(TrueVal))) { 14588 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14589 unsigned ShiftAmount = TrueConst->logBase2(); 14590 if (ShiftAmount) 14591 TrueVal = DAG.getConstant(1, dl, VT); 14592 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 14593 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 14594 14595 if (ShiftAmount) 14596 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 14597 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 14598 } 14599 14600 if (Res.getNode()) { 14601 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 14602 // Capture demanded bits information that would be otherwise lost. 
14603 if (Known.Zero == 0xfffffffe) 14604 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14605 DAG.getValueType(MVT::i1)); 14606 else if (Known.Zero == 0xffffff00) 14607 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14608 DAG.getValueType(MVT::i8)); 14609 else if (Known.Zero == 0xffff0000) 14610 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14611 DAG.getValueType(MVT::i16)); 14612 } 14613 14614 return Res; 14615 } 14616 14617 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 14618 DAGCombinerInfo &DCI) const { 14619 switch (N->getOpcode()) { 14620 default: break; 14621 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 14622 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 14623 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 14624 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 14625 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 14626 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 14627 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 14628 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 14629 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 14630 case ISD::BRCOND: 14631 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 14632 case ARMISD::ADDC: 14633 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 14634 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 14635 case ARMISD::BFI: return PerformBFICombine(N, DCI); 14636 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 14637 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 14638 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 14639 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 14640 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 14641 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 14642 
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 14643 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 14644 case ISD::FP_TO_SINT: 14645 case ISD::FP_TO_UINT: 14646 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 14647 case ISD::FDIV: 14648 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 14649 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 14650 case ISD::SHL: 14651 case ISD::SRA: 14652 case ISD::SRL: 14653 return PerformShiftCombine(N, DCI, Subtarget); 14654 case ISD::SIGN_EXTEND: 14655 case ISD::ZERO_EXTEND: 14656 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 14657 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 14658 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 14659 case ISD::LOAD: return PerformLOADCombine(N, DCI); 14660 case ARMISD::VLD1DUP: 14661 case ARMISD::VLD2DUP: 14662 case ARMISD::VLD3DUP: 14663 case ARMISD::VLD4DUP: 14664 return PerformVLDCombine(N, DCI); 14665 case ARMISD::BUILD_VECTOR: 14666 return PerformARMBUILD_VECTORCombine(N, DCI); 14667 case ARMISD::PREDICATE_CAST: 14668 return PerformPREDICATE_CASTCombine(N, DCI); 14669 case ARMISD::VCMP: 14670 return PerformVCMPCombine(N, DCI, Subtarget); 14671 case ARMISD::SMULWB: { 14672 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14673 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14674 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14675 return SDValue(); 14676 break; 14677 } 14678 case ARMISD::SMULWT: { 14679 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14680 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 14681 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14682 return SDValue(); 14683 break; 14684 } 14685 case ARMISD::SMLALBB: 14686 case ARMISD::QADD16b: 14687 case ARMISD::QSUB16b: { 14688 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14689 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14690 
if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14691 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14692 return SDValue(); 14693 break; 14694 } 14695 case ARMISD::SMLALBT: { 14696 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 14697 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14698 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 14699 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14700 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 14701 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 14702 return SDValue(); 14703 break; 14704 } 14705 case ARMISD::SMLALTB: { 14706 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 14707 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14708 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 14709 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14710 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 14711 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 14712 return SDValue(); 14713 break; 14714 } 14715 case ARMISD::SMLALTT: { 14716 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14717 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 14718 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14719 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14720 return SDValue(); 14721 break; 14722 } 14723 case ARMISD::QADD8b: 14724 case ARMISD::QSUB8b: { 14725 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14726 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 14727 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14728 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14729 return SDValue(); 14730 break; 14731 } 14732 case ISD::INTRINSIC_VOID: 14733 case ISD::INTRINSIC_W_CHAIN: 14734 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 14735 case 
Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      // All of the NEON load/store intrinsics listed above share one combine.
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  // No combine applied: an empty SDValue is returned.
  return SDValue();
}

/// Report whether the operation \p Opc on type \p VT should be performed as
/// an integer operation. Only f32 loads and stores answer true.
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

/// Return true if a misaligned access of type \p VT is supported by the
/// current subtarget.
///
/// \param VT        type being loaded or stored; non-simple types are
///                  rejected outright.
/// \param Alignment alignment of the access; only consulted for the narrow
///                  MVE types (v4i8, v8i8, v4i16).
/// \param Fast      optional out-parameter. It is written only on the paths
///                  that return true, and is left untouched otherwise.
///
/// The address-space and memory-operand-flag parameters are unnamed and
/// unused.
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      // The access is only reported as fast when the subtarget has v7 ops.
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
    // Otherwise fall through: none of the later checks match a scalar
    // integer type, so the function ends up returning false.
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  // Everything below is MVE-specific.
  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}

/// Return true when both \p SrcAlign and \p DstAlign are acceptable for an
/// access that wants \p AlignCheck alignment. An alignment of 0 is always
/// accepted; any other value must be a multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

/// Pick the value type to use when expanding a memory operation of \p Size
/// bytes. Returns MVT::v2f64 or MVT::f64 when NEON can be used, and
/// MVT::Other to defer to the target-independent logic.
///
/// \p MemcpyStrSrc is not consulted by this implementation.
EVT ARMTargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  // (Only for non-memset operations or zeroing memsets, and only when the
  // function is not marked noimplicitfloat.)
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Fast is only read after a successful allowsMisalignedMemoryAccesses
    // call. For f64/v2f64 every true-returning path of that function writes
    // through a non-null Fast pointer.
    bool Fast;
    // NOTE: memOpAlign is declared as (DstAlign, SrcAlign, ...) but is called
    // here with (SrcAlign, DstAlign, ...). The check treats both arguments
    // identically, so the swapped order does not change the result.
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
14878 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 14879 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 14880 return false; 14881 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 14882 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 14883 return (SrcBits == 64 && DestBits == 32); 14884 } 14885 14886 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 14887 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 14888 !DstVT.isInteger()) 14889 return false; 14890 unsigned SrcBits = SrcVT.getSizeInBits(); 14891 unsigned DestBits = DstVT.getSizeInBits(); 14892 return (SrcBits == 64 && DestBits == 32); 14893 } 14894 14895 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 14896 if (Val.getOpcode() != ISD::LOAD) 14897 return false; 14898 14899 EVT VT1 = Val.getValueType(); 14900 if (!VT1.isSimple() || !VT1.isInteger() || 14901 !VT2.isSimple() || !VT2.isInteger()) 14902 return false; 14903 14904 switch (VT1.getSimpleVT().SimpleTy) { 14905 default: break; 14906 case MVT::i1: 14907 case MVT::i8: 14908 case MVT::i16: 14909 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 14910 return true; 14911 } 14912 14913 return false; 14914 } 14915 14916 bool ARMTargetLowering::isFNegFree(EVT VT) const { 14917 if (!VT.isSimple()) 14918 return false; 14919 14920 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 14921 // negate values directly (fneg is free). So, we don't want to let the DAG 14922 // combiner rewrite fneg into xors and some other instructions. For f16 and 14923 // FullFP16 argument passing, some bitcast nodes may be introduced, 14924 // triggering this DAG combine rewrite, so we are avoiding that with this. 
14925 switch (VT.getSimpleVT().SimpleTy) { 14926 default: break; 14927 case MVT::f16: 14928 return Subtarget->hasFullFP16(); 14929 } 14930 14931 return false; 14932 } 14933 14934 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 14935 /// of the vector elements. 14936 static bool areExtractExts(Value *Ext1, Value *Ext2) { 14937 auto areExtDoubled = [](Instruction *Ext) { 14938 return Ext->getType()->getScalarSizeInBits() == 14939 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 14940 }; 14941 14942 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 14943 !match(Ext2, m_ZExtOrSExt(m_Value())) || 14944 !areExtDoubled(cast<Instruction>(Ext1)) || 14945 !areExtDoubled(cast<Instruction>(Ext2))) 14946 return false; 14947 14948 return true; 14949 } 14950 14951 /// Check if sinking \p I's operands to I's basic block is profitable, because 14952 /// the operands can be folded into a target instruction, e.g. 14953 /// sext/zext can be folded into vsubl. 14954 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 14955 SmallVectorImpl<Use *> &Ops) const { 14956 if (!I->getType()->isVectorTy()) 14957 return false; 14958 14959 if (Subtarget->hasNEON()) { 14960 switch (I->getOpcode()) { 14961 case Instruction::Sub: 14962 case Instruction::Add: { 14963 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 14964 return false; 14965 Ops.push_back(&I->getOperandUse(0)); 14966 Ops.push_back(&I->getOperandUse(1)); 14967 return true; 14968 } 14969 default: 14970 return false; 14971 } 14972 } 14973 14974 if (!Subtarget->hasMVEIntegerOps()) 14975 return false; 14976 14977 auto IsSinker = [](Instruction *I, int Operand) { 14978 switch (I->getOpcode()) { 14979 case Instruction::Add: 14980 case Instruction::Mul: 14981 case Instruction::ICmp: 14982 return true; 14983 case Instruction::Sub: 14984 case Instruction::Shl: 14985 case Instruction::LShr: 14986 case Instruction::AShr: 14987 return Operand == 1; 14988 default: 14989 return false; 14990 } 14991 
}; 14992 14993 int Op = 0; 14994 if (!isa<ShuffleVectorInst>(I->getOperand(Op))) 14995 Op = 1; 14996 if (!IsSinker(I, Op)) 14997 return false; 14998 if (!match(I->getOperand(Op), 14999 m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), 15000 m_Undef(), m_Zero()))) { 15001 return false; 15002 } 15003 Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); 15004 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 15005 // and vector registers 15006 for (Use &U : Shuffle->uses()) { 15007 Instruction *Insn = cast<Instruction>(U.getUser()); 15008 if (!IsSinker(Insn, U.getOperandNo())) 15009 return false; 15010 } 15011 Ops.push_back(&Shuffle->getOperandUse(0)); 15012 Ops.push_back(&I->getOperandUse(Op)); 15013 return true; 15014 } 15015 15016 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 15017 EVT VT = ExtVal.getValueType(); 15018 15019 if (!isTypeLegal(VT)) 15020 return false; 15021 15022 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 15023 if (Ld->isExpandingLoad()) 15024 return false; 15025 } 15026 15027 // Don't create a loadext if we can fold the extension into a wide/long 15028 // instruction. 15029 // If there's more than one user instruction, the loadext is desirable no 15030 // matter what. There can be two uses by the same instruction. 
15031 if (ExtVal->use_empty() || 15032 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 15033 return true; 15034 15035 SDNode *U = *ExtVal->use_begin(); 15036 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 15037 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) 15038 return false; 15039 15040 return true; 15041 } 15042 15043 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 15044 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 15045 return false; 15046 15047 if (!isTypeLegal(EVT::getEVT(Ty1))) 15048 return false; 15049 15050 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 15051 15052 // Assuming the caller doesn't have a zeroext or signext return parameter, 15053 // truncation all the way down to i1 is valid. 15054 return true; 15055 } 15056 15057 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 15058 const AddrMode &AM, Type *Ty, 15059 unsigned AS) const { 15060 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 15061 if (Subtarget->hasFPAO()) 15062 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 15063 return 0; 15064 } 15065 return -1; 15066 } 15067 15068 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster 15069 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be 15070 /// expanded to FMAs when this method returns true, otherwise fmuladd is 15071 /// expanded to fmul + fadd. 15072 /// 15073 /// ARM supports both fused and unfused multiply-add operations; we already 15074 /// lower a pair of fmul and fadd to the latter so it's not clear that there 15075 /// would be a gain or that the gain would be worthwhile enough to risk 15076 /// correctness bugs. 15077 /// 15078 /// For MVE, we set this to true as it helps simplify the need for some 15079 /// patterns (and we don't have the non-fused floating point instruction). 
15080 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 15081 EVT VT) const { 15082 if (!VT.isSimple()) 15083 return false; 15084 15085 switch (VT.getSimpleVT().SimpleTy) { 15086 case MVT::v4f32: 15087 case MVT::v8f16: 15088 return Subtarget->hasMVEFloatOps(); 15089 case MVT::f16: 15090 return Subtarget->useFPVFMx16(); 15091 case MVT::f32: 15092 return Subtarget->useFPVFMx(); 15093 case MVT::f64: 15094 return Subtarget->useFPVFMx64(); 15095 default: 15096 break; 15097 } 15098 15099 return false; 15100 } 15101 15102 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 15103 if (V < 0) 15104 return false; 15105 15106 unsigned Scale = 1; 15107 switch (VT.getSimpleVT().SimpleTy) { 15108 case MVT::i1: 15109 case MVT::i8: 15110 // Scale == 1; 15111 break; 15112 case MVT::i16: 15113 // Scale == 2; 15114 Scale = 2; 15115 break; 15116 default: 15117 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 15118 // Scale == 4; 15119 Scale = 4; 15120 break; 15121 } 15122 15123 if ((V & (Scale - 1)) != 0) 15124 return false; 15125 return isUInt<5>(V / Scale); 15126 } 15127 15128 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 15129 const ARMSubtarget *Subtarget) { 15130 if (!VT.isInteger() && !VT.isFloatingPoint()) 15131 return false; 15132 if (VT.isVector() && Subtarget->hasNEON()) 15133 return false; 15134 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 15135 !Subtarget->hasMVEFloatOps()) 15136 return false; 15137 15138 bool IsNeg = false; 15139 if (V < 0) { 15140 IsNeg = true; 15141 V = -V; 15142 } 15143 15144 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 15145 15146 // MVE: size * imm7 15147 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 15148 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 15149 case MVT::i32: 15150 case MVT::f32: 15151 return isShiftedUInt<7,2>(V); 15152 case MVT::i16: 15153 case MVT::f16: 15154 return isShiftedUInt<7,1>(V); 15155 
case MVT::i8: 15156 return isUInt<7>(V); 15157 default: 15158 return false; 15159 } 15160 } 15161 15162 // half VLDR: 2 * imm8 15163 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 15164 return isShiftedUInt<8, 1>(V); 15165 // VLDR and LDRD: 4 * imm8 15166 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 15167 return isShiftedUInt<8, 2>(V); 15168 15169 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 15170 // + imm12 or - imm8 15171 if (IsNeg) 15172 return isUInt<8>(V); 15173 return isUInt<12>(V); 15174 } 15175 15176 return false; 15177 } 15178 15179 /// isLegalAddressImmediate - Return true if the integer value can be used 15180 /// as the offset of the target addressing mode for load / store of the 15181 /// given type. 15182 static bool isLegalAddressImmediate(int64_t V, EVT VT, 15183 const ARMSubtarget *Subtarget) { 15184 if (V == 0) 15185 return true; 15186 15187 if (!VT.isSimple()) 15188 return false; 15189 15190 if (Subtarget->isThumb1Only()) 15191 return isLegalT1AddressImmediate(V, VT); 15192 else if (Subtarget->isThumb2()) 15193 return isLegalT2AddressImmediate(V, VT, Subtarget); 15194 15195 // ARM mode. 15196 if (V < 0) 15197 V = - V; 15198 switch (VT.getSimpleVT().SimpleTy) { 15199 default: return false; 15200 case MVT::i1: 15201 case MVT::i8: 15202 case MVT::i32: 15203 // +- imm12 15204 return isUInt<12>(V); 15205 case MVT::i16: 15206 // +- imm8 15207 return isUInt<8>(V); 15208 case MVT::f32: 15209 case MVT::f64: 15210 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 
15211 return false; 15212 return isShiftedUInt<8, 2>(V); 15213 } 15214 } 15215 15216 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 15217 EVT VT) const { 15218 int Scale = AM.Scale; 15219 if (Scale < 0) 15220 return false; 15221 15222 switch (VT.getSimpleVT().SimpleTy) { 15223 default: return false; 15224 case MVT::i1: 15225 case MVT::i8: 15226 case MVT::i16: 15227 case MVT::i32: 15228 if (Scale == 1) 15229 return true; 15230 // r + r << imm 15231 Scale = Scale & ~1; 15232 return Scale == 2 || Scale == 4 || Scale == 8; 15233 case MVT::i64: 15234 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 15235 // version in Thumb mode. 15236 // r + r 15237 if (Scale == 1) 15238 return true; 15239 // r * 2 (this can be lowered to r + r). 15240 if (!AM.HasBaseReg && Scale == 2) 15241 return true; 15242 return false; 15243 case MVT::isVoid: 15244 // Note, we allow "void" uses (basically, uses that aren't loads or 15245 // stores), because arm allows folding a scale into many arithmetic 15246 // operations. This should be made more precise and revisited later. 15247 15248 // Allow r << imm, but the imm has to be a multiple of two. 15249 if (Scale & 1) return false; 15250 return isPowerOf2_32(Scale); 15251 } 15252 } 15253 15254 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 15255 EVT VT) const { 15256 const int Scale = AM.Scale; 15257 15258 // Negative scales are not supported in Thumb1. 15259 if (Scale < 0) 15260 return false; 15261 15262 // Thumb1 addressing modes do not support register scaling excepting the 15263 // following cases: 15264 // 1. Scale == 1 means no scaling. 15265 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 15266 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 15267 } 15268 15269 /// isLegalAddressingMode - Return true if the addressing mode represented 15270 /// by AM is legal for this target, for a load/store of the specified type. 
15271 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 15272 const AddrMode &AM, Type *Ty, 15273 unsigned AS, Instruction *I) const { 15274 EVT VT = getValueType(DL, Ty, true); 15275 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 15276 return false; 15277 15278 // Can never fold addr of global into load/store. 15279 if (AM.BaseGV) 15280 return false; 15281 15282 switch (AM.Scale) { 15283 case 0: // no scale reg, must be "r+i" or "r", or "i". 15284 break; 15285 default: 15286 // ARM doesn't support any R+R*scale+imm addr modes. 15287 if (AM.BaseOffs) 15288 return false; 15289 15290 if (!VT.isSimple()) 15291 return false; 15292 15293 if (Subtarget->isThumb1Only()) 15294 return isLegalT1ScaledAddressingMode(AM, VT); 15295 15296 if (Subtarget->isThumb2()) 15297 return isLegalT2ScaledAddressingMode(AM, VT); 15298 15299 int Scale = AM.Scale; 15300 switch (VT.getSimpleVT().SimpleTy) { 15301 default: return false; 15302 case MVT::i1: 15303 case MVT::i8: 15304 case MVT::i32: 15305 if (Scale < 0) Scale = -Scale; 15306 if (Scale == 1) 15307 return true; 15308 // r + r << imm 15309 return isPowerOf2_32(Scale & ~1); 15310 case MVT::i16: 15311 case MVT::i64: 15312 // r +/- r 15313 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 15314 return true; 15315 // r * 2 (this can be lowered to r + r). 15316 if (!AM.HasBaseReg && Scale == 2) 15317 return true; 15318 return false; 15319 15320 case MVT::isVoid: 15321 // Note, we allow "void" uses (basically, uses that aren't loads or 15322 // stores), because arm allows folding a scale into many arithmetic 15323 // operations. This should be made more precise and revisited later. 15324 15325 // Allow r << imm, but the imm has to be a multiple of two. 
15326 if (Scale & 1) return false; 15327 return isPowerOf2_32(Scale); 15328 } 15329 } 15330 return true; 15331 } 15332 15333 /// isLegalICmpImmediate - Return true if the specified immediate is legal 15334 /// icmp immediate, that is the target has icmp instructions which can compare 15335 /// a register against the immediate without having to materialize the 15336 /// immediate into a register. 15337 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 15338 // Thumb2 and ARM modes can use cmn for negative immediates. 15339 if (!Subtarget->isThumb()) 15340 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 15341 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 15342 if (Subtarget->isThumb2()) 15343 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 15344 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 15345 // Thumb1 doesn't have cmn, and only 8-bit immediates. 15346 return Imm >= 0 && Imm <= 255; 15347 } 15348 15349 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 15350 /// *or sub* immediate, that is the target has add or sub instructions which can 15351 /// add a register with the immediate without having to materialize the 15352 /// immediate into a register. 15353 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 15354 // Same encoding for add/sub, just flip the sign. 15355 int64_t AbsImm = std::abs(Imm); 15356 if (!Subtarget->isThumb()) 15357 return ARM_AM::getSOImmVal(AbsImm) != -1; 15358 if (Subtarget->isThumb2()) 15359 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 15360 // Thumb1 only has 8-bit unsigned immediate. 
15361 return AbsImm >= 0 && AbsImm <= 255; 15362 } 15363 15364 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 15365 bool isSEXTLoad, SDValue &Base, 15366 SDValue &Offset, bool &isInc, 15367 SelectionDAG &DAG) { 15368 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15369 return false; 15370 15371 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 15372 // AddressingMode 3 15373 Base = Ptr->getOperand(0); 15374 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15375 int RHSC = (int)RHS->getZExtValue(); 15376 if (RHSC < 0 && RHSC > -256) { 15377 assert(Ptr->getOpcode() == ISD::ADD); 15378 isInc = false; 15379 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15380 return true; 15381 } 15382 } 15383 isInc = (Ptr->getOpcode() == ISD::ADD); 15384 Offset = Ptr->getOperand(1); 15385 return true; 15386 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 15387 // AddressingMode 2 15388 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15389 int RHSC = (int)RHS->getZExtValue(); 15390 if (RHSC < 0 && RHSC > -0x1000) { 15391 assert(Ptr->getOpcode() == ISD::ADD); 15392 isInc = false; 15393 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15394 Base = Ptr->getOperand(0); 15395 return true; 15396 } 15397 } 15398 15399 if (Ptr->getOpcode() == ISD::ADD) { 15400 isInc = true; 15401 ARM_AM::ShiftOpc ShOpcVal= 15402 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 15403 if (ShOpcVal != ARM_AM::no_shift) { 15404 Base = Ptr->getOperand(1); 15405 Offset = Ptr->getOperand(0); 15406 } else { 15407 Base = Ptr->getOperand(0); 15408 Offset = Ptr->getOperand(1); 15409 } 15410 return true; 15411 } 15412 15413 isInc = (Ptr->getOpcode() == ISD::ADD); 15414 Base = Ptr->getOperand(0); 15415 Offset = Ptr->getOperand(1); 15416 return true; 15417 } 15418 15419 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 
15420 return false; 15421 } 15422 15423 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 15424 bool isSEXTLoad, SDValue &Base, 15425 SDValue &Offset, bool &isInc, 15426 SelectionDAG &DAG) { 15427 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15428 return false; 15429 15430 Base = Ptr->getOperand(0); 15431 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15432 int RHSC = (int)RHS->getZExtValue(); 15433 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 15434 assert(Ptr->getOpcode() == ISD::ADD); 15435 isInc = false; 15436 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15437 return true; 15438 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 15439 isInc = Ptr->getOpcode() == ISD::ADD; 15440 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15441 return true; 15442 } 15443 } 15444 15445 return false; 15446 } 15447 15448 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, 15449 bool isSEXTLoad, bool IsMasked, bool isLE, 15450 SDValue &Base, SDValue &Offset, 15451 bool &isInc, SelectionDAG &DAG) { 15452 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15453 return false; 15454 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 15455 return false; 15456 15457 // We allow LE non-masked loads to change the type (for example use a vldrb.8 15458 // as opposed to a vldrw.32). This can allow extra addressing modes or 15459 // alignments for what is otherwise an equivalent instruction. 
15460 bool CanChangeType = isLE && !IsMasked; 15461 15462 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 15463 int RHSC = (int)RHS->getZExtValue(); 15464 15465 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 15466 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 15467 assert(Ptr->getOpcode() == ISD::ADD); 15468 isInc = false; 15469 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15470 return true; 15471 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 15472 isInc = Ptr->getOpcode() == ISD::ADD; 15473 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15474 return true; 15475 } 15476 return false; 15477 }; 15478 15479 // Try to find a matching instruction based on s/zext, Alignment, Offset and 15480 // (in BE/masked) type. 15481 Base = Ptr->getOperand(0); 15482 if (VT == MVT::v4i16) { 15483 if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) 15484 return true; 15485 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 15486 if (IsInRange(RHSC, 0x80, 1)) 15487 return true; 15488 } else if (Align >= 4 && 15489 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 15490 IsInRange(RHSC, 0x80, 4)) 15491 return true; 15492 else if (Align >= 2 && 15493 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 15494 IsInRange(RHSC, 0x80, 2)) 15495 return true; 15496 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 15497 return true; 15498 return false; 15499 } 15500 15501 /// getPreIndexedAddressParts - returns true by value, base pointer and 15502 /// offset pointer and addressing mode by reference if the node's address 15503 /// can be legally represented as pre-indexed load / store address. 
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                             SDValue &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) const {
  // Pre-indexed addressing is never formed on Thumb1-only subtargets.
  if (Subtarget->isThumb1Only())
    return false;

  // Gather the pointer, memory type and alignment from whichever kind of
  // memory node N is; any other node kind is rejected.
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
    IsMasked = true;
  } else
    return false;

  // Vector types go through the MVE helper (and require MVE integer ops);
  // scalar types use the Thumb2 or ARM helper. isInc is only read after one
  // of the helpers has reported success.
  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
                                        IsMasked, Subtarget->isLittle(), Base,
                                        Offset, isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                         Offset, isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                          Offset, isInc, DAG);
  }
  if (!isLegal)
    return false;

  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Gather the pointer, memory type and alignment from whichever kind of
  // memory node N is. isNonExt is assigned in every branch that does not
  // return, so it is always initialised by the time it is read below.
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false, isNonExt;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
    IsMasked = true;
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  // Same helper dispatch as the pre-indexed case, but the address parts are
  // taken from Op (the add/sub node) rather than from the memory node's own
  // pointer.
  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                         isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
  }
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

/// Compute known-zero / known-one bits for the ARM-specific node \p Op and
/// store them in \p Known. \p Known is reset first, so opcodes that are not
/// handled leave it with nothing known. \p DemandedElts is not consulted.
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        // Every bit above bit 0 is known to be zero.
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    // Nothing known about operand 0 means nothing can be known about the
    // result, so operand 1 is not examined.
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One  &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    // Every path through this case returns, so control never falls through
    // into the BFI case below.
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // The bits above the width of the memory type are known zero.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    // Compute the known bits of the single lane being extracted, then extend
    // them to the result width.
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
    // SrcSz is only used by the asserts; the cast silences the unused
    // variable warning in builds where asserts are compiled out.
    (void)SrcSz;
    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    // VGETLANEs sign-extends the lane; VGETLANEu zero-extends it.
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz, true /* extended bits are known zero */);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  }
}

bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &DemandedAPInt,
                                                TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
15767 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 15768 if (!C) 15769 return false; 15770 15771 unsigned Mask = C->getZExtValue(); 15772 15773 unsigned Demanded = DemandedAPInt.getZExtValue(); 15774 unsigned ShrunkMask = Mask & Demanded; 15775 unsigned ExpandedMask = Mask | ~Demanded; 15776 15777 // If the mask is all zeros, let the target-independent code replace the 15778 // result with zero. 15779 if (ShrunkMask == 0) 15780 return false; 15781 15782 // If the mask is all ones, erase the AND. (Currently, the target-independent 15783 // code won't do this, so we have to do it explicitly to avoid an infinite 15784 // loop in obscure cases.) 15785 if (ExpandedMask == ~0U) 15786 return TLO.CombineTo(Op, Op.getOperand(0)); 15787 15788 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 15789 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 15790 }; 15791 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 15792 if (NewMask == Mask) 15793 return true; 15794 SDLoc DL(Op); 15795 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 15796 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 15797 return TLO.CombineTo(Op, NewOp); 15798 }; 15799 15800 // Prefer uxtb mask. 15801 if (IsLegalMask(0xFF)) 15802 return UseMask(0xFF); 15803 15804 // Prefer uxth mask. 15805 if (IsLegalMask(0xFFFF)) 15806 return UseMask(0xFFFF); 15807 15808 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 15809 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 15810 if (ShrunkMask < 256) 15811 return UseMask(ShrunkMask); 15812 15813 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 15814 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 
15815 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 15816 return UseMask(ExpandedMask); 15817 15818 // Potential improvements: 15819 // 15820 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 15821 // We could try to prefer Thumb1 immediates which can be lowered to a 15822 // two-instruction sequence. 15823 // We could try to recognize more legal ARM/Thumb2 immediates here. 15824 15825 return false; 15826 } 15827 15828 15829 //===----------------------------------------------------------------------===// 15830 // ARM Inline Assembly Support 15831 //===----------------------------------------------------------------------===// 15832 15833 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 15834 // Looking for "rev" which is V6+. 15835 if (!Subtarget->hasV6Ops()) 15836 return false; 15837 15838 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 15839 std::string AsmStr = IA->getAsmString(); 15840 SmallVector<StringRef, 4> AsmPieces; 15841 SplitString(AsmStr, AsmPieces, ";\n"); 15842 15843 switch (AsmPieces.size()) { 15844 default: return false; 15845 case 1: 15846 AsmStr = AsmPieces[0]; 15847 AsmPieces.clear(); 15848 SplitString(AsmStr, AsmPieces, " \t,"); 15849 15850 // rev $0, $1 15851 if (AsmPieces.size() == 3 && 15852 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 15853 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 15854 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 15855 if (Ty && Ty->getBitWidth() == 32) 15856 return IntrinsicLowering::LowerToByteSwap(CI); 15857 } 15858 break; 15859 } 15860 15861 return false; 15862 } 15863 15864 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 15865 // At this point, we have to lower this constraint to something else, so we 15866 // lower it to an "r" or "w". However, by doing this we will force the result 15867 // to be in register, while the X constraint is much more permissive. 
15868 // 15869 // Although we are correct (we are free to emit anything, without 15870 // constraints), we might break use cases that would expect us to be more 15871 // efficient and emit something else. 15872 if (!Subtarget->hasVFP2Base()) 15873 return "r"; 15874 if (ConstraintVT.isFloatingPoint()) 15875 return "w"; 15876 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 15877 (ConstraintVT.getSizeInBits() == 64 || 15878 ConstraintVT.getSizeInBits() == 128)) 15879 return "w"; 15880 15881 return "r"; 15882 } 15883 15884 /// getConstraintType - Given a constraint letter, return the type of 15885 /// constraint it is for this target. 15886 ARMTargetLowering::ConstraintType 15887 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 15888 unsigned S = Constraint.size(); 15889 if (S == 1) { 15890 switch (Constraint[0]) { 15891 default: break; 15892 case 'l': return C_RegisterClass; 15893 case 'w': return C_RegisterClass; 15894 case 'h': return C_RegisterClass; 15895 case 'x': return C_RegisterClass; 15896 case 't': return C_RegisterClass; 15897 case 'j': return C_Immediate; // Constant for movw. 15898 // An address with a single base register. Due to the way we 15899 // currently handle addresses it is the same as an 'r' memory constraint. 15900 case 'Q': return C_Memory; 15901 } 15902 } else if (S == 2) { 15903 switch (Constraint[0]) { 15904 default: break; 15905 case 'T': return C_RegisterClass; 15906 // All 'U+' constraints are addresses. 15907 case 'U': return C_Memory; 15908 } 15909 } 15910 return TargetLowering::getConstraintType(Constraint); 15911 } 15912 15913 /// Examine constraint type and operand type and determine a weight value. 15914 /// This object must already have been set up with the operand type 15915 /// and the current alternative constraint selected. 
15916 TargetLowering::ConstraintWeight 15917 ARMTargetLowering::getSingleConstraintMatchWeight( 15918 AsmOperandInfo &info, const char *constraint) const { 15919 ConstraintWeight weight = CW_Invalid; 15920 Value *CallOperandVal = info.CallOperandVal; 15921 // If we don't have a value, we can't do a match, 15922 // but allow it at the lowest weight. 15923 if (!CallOperandVal) 15924 return CW_Default; 15925 Type *type = CallOperandVal->getType(); 15926 // Look at the constraint type. 15927 switch (*constraint) { 15928 default: 15929 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 15930 break; 15931 case 'l': 15932 if (type->isIntegerTy()) { 15933 if (Subtarget->isThumb()) 15934 weight = CW_SpecificReg; 15935 else 15936 weight = CW_Register; 15937 } 15938 break; 15939 case 'w': 15940 if (type->isFloatingPointTy()) 15941 weight = CW_Register; 15942 break; 15943 } 15944 return weight; 15945 } 15946 15947 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 15948 15949 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 15950 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 15951 switch (Constraint.size()) { 15952 case 1: 15953 // GCC ARM Constraint Letters 15954 switch (Constraint[0]) { 15955 case 'l': // Low regs or general regs. 15956 if (Subtarget->isThumb()) 15957 return RCPair(0U, &ARM::tGPRRegClass); 15958 return RCPair(0U, &ARM::GPRRegClass); 15959 case 'h': // High regs or no regs. 
15960 if (Subtarget->isThumb()) 15961 return RCPair(0U, &ARM::hGPRRegClass); 15962 break; 15963 case 'r': 15964 if (Subtarget->isThumb1Only()) 15965 return RCPair(0U, &ARM::tGPRRegClass); 15966 return RCPair(0U, &ARM::GPRRegClass); 15967 case 'w': 15968 if (VT == MVT::Other) 15969 break; 15970 if (VT == MVT::f32) 15971 return RCPair(0U, &ARM::SPRRegClass); 15972 if (VT.getSizeInBits() == 64) 15973 return RCPair(0U, &ARM::DPRRegClass); 15974 if (VT.getSizeInBits() == 128) 15975 return RCPair(0U, &ARM::QPRRegClass); 15976 break; 15977 case 'x': 15978 if (VT == MVT::Other) 15979 break; 15980 if (VT == MVT::f32) 15981 return RCPair(0U, &ARM::SPR_8RegClass); 15982 if (VT.getSizeInBits() == 64) 15983 return RCPair(0U, &ARM::DPR_8RegClass); 15984 if (VT.getSizeInBits() == 128) 15985 return RCPair(0U, &ARM::QPR_8RegClass); 15986 break; 15987 case 't': 15988 if (VT == MVT::Other) 15989 break; 15990 if (VT == MVT::f32 || VT == MVT::i32) 15991 return RCPair(0U, &ARM::SPRRegClass); 15992 if (VT.getSizeInBits() == 64) 15993 return RCPair(0U, &ARM::DPR_VFP2RegClass); 15994 if (VT.getSizeInBits() == 128) 15995 return RCPair(0U, &ARM::QPR_VFP2RegClass); 15996 break; 15997 } 15998 break; 15999 16000 case 2: 16001 if (Constraint[0] == 'T') { 16002 switch (Constraint[1]) { 16003 default: 16004 break; 16005 case 'e': 16006 return RCPair(0U, &ARM::tGPREvenRegClass); 16007 case 'o': 16008 return RCPair(0U, &ARM::tGPROddRegClass); 16009 } 16010 } 16011 break; 16012 16013 default: 16014 break; 16015 } 16016 16017 if (StringRef("{cc}").equals_lower(Constraint)) 16018 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 16019 16020 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 16021 } 16022 16023 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 16024 /// vector. If it is invalid, don't add anything to Ops. 
16025 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16026 std::string &Constraint, 16027 std::vector<SDValue>&Ops, 16028 SelectionDAG &DAG) const { 16029 SDValue Result; 16030 16031 // Currently only support length 1 constraints. 16032 if (Constraint.length() != 1) return; 16033 16034 char ConstraintLetter = Constraint[0]; 16035 switch (ConstraintLetter) { 16036 default: break; 16037 case 'j': 16038 case 'I': case 'J': case 'K': case 'L': 16039 case 'M': case 'N': case 'O': 16040 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 16041 if (!C) 16042 return; 16043 16044 int64_t CVal64 = C->getSExtValue(); 16045 int CVal = (int) CVal64; 16046 // None of these constraints allow values larger than 32 bits. Check 16047 // that the value fits in an int. 16048 if (CVal != CVal64) 16049 return; 16050 16051 switch (ConstraintLetter) { 16052 case 'j': 16053 // Constant suitable for movw, must be between 0 and 16054 // 65535. 16055 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 16056 if (CVal >= 0 && CVal <= 65535) 16057 break; 16058 return; 16059 case 'I': 16060 if (Subtarget->isThumb1Only()) { 16061 // This must be a constant between 0 and 255, for ADD 16062 // immediates. 16063 if (CVal >= 0 && CVal <= 255) 16064 break; 16065 } else if (Subtarget->isThumb2()) { 16066 // A constant that can be used as an immediate value in a 16067 // data-processing instruction. 16068 if (ARM_AM::getT2SOImmVal(CVal) != -1) 16069 break; 16070 } else { 16071 // A constant that can be used as an immediate value in a 16072 // data-processing instruction. 16073 if (ARM_AM::getSOImmVal(CVal) != -1) 16074 break; 16075 } 16076 return; 16077 16078 case 'J': 16079 if (Subtarget->isThumb1Only()) { 16080 // This must be a constant between -255 and -1, for negated ADD 16081 // immediates. This can be used in GCC with an "n" modifier that 16082 // prints the negated value, for use with SUB instructions. 
It is 16083 // not useful otherwise but is implemented for compatibility. 16084 if (CVal >= -255 && CVal <= -1) 16085 break; 16086 } else { 16087 // This must be a constant between -4095 and 4095. It is not clear 16088 // what this constraint is intended for. Implemented for 16089 // compatibility with GCC. 16090 if (CVal >= -4095 && CVal <= 4095) 16091 break; 16092 } 16093 return; 16094 16095 case 'K': 16096 if (Subtarget->isThumb1Only()) { 16097 // A 32-bit value where only one byte has a nonzero value. Exclude 16098 // zero to match GCC. This constraint is used by GCC internally for 16099 // constants that can be loaded with a move/shift combination. 16100 // It is not useful otherwise but is implemented for compatibility. 16101 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 16102 break; 16103 } else if (Subtarget->isThumb2()) { 16104 // A constant whose bitwise inverse can be used as an immediate 16105 // value in a data-processing instruction. This can be used in GCC 16106 // with a "B" modifier that prints the inverted value, for use with 16107 // BIC and MVN instructions. It is not useful otherwise but is 16108 // implemented for compatibility. 16109 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 16110 break; 16111 } else { 16112 // A constant whose bitwise inverse can be used as an immediate 16113 // value in a data-processing instruction. This can be used in GCC 16114 // with a "B" modifier that prints the inverted value, for use with 16115 // BIC and MVN instructions. It is not useful otherwise but is 16116 // implemented for compatibility. 16117 if (ARM_AM::getSOImmVal(~CVal) != -1) 16118 break; 16119 } 16120 return; 16121 16122 case 'L': 16123 if (Subtarget->isThumb1Only()) { 16124 // This must be a constant between -7 and 7, 16125 // for 3-operand ADD/SUB immediate instructions. 
16126 if (CVal >= -7 && CVal < 7) 16127 break; 16128 } else if (Subtarget->isThumb2()) { 16129 // A constant whose negation can be used as an immediate value in a 16130 // data-processing instruction. This can be used in GCC with an "n" 16131 // modifier that prints the negated value, for use with SUB 16132 // instructions. It is not useful otherwise but is implemented for 16133 // compatibility. 16134 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 16135 break; 16136 } else { 16137 // A constant whose negation can be used as an immediate value in a 16138 // data-processing instruction. This can be used in GCC with an "n" 16139 // modifier that prints the negated value, for use with SUB 16140 // instructions. It is not useful otherwise but is implemented for 16141 // compatibility. 16142 if (ARM_AM::getSOImmVal(-CVal) != -1) 16143 break; 16144 } 16145 return; 16146 16147 case 'M': 16148 if (Subtarget->isThumb1Only()) { 16149 // This must be a multiple of 4 between 0 and 1020, for 16150 // ADD sp + immediate. 16151 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 16152 break; 16153 } else { 16154 // A power of two or a constant between 0 and 32. This is used in 16155 // GCC for the shift amount on shifted register operands, but it is 16156 // useful in general for any shift amounts. 16157 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 16158 break; 16159 } 16160 return; 16161 16162 case 'N': 16163 if (Subtarget->isThumb1Only()) { 16164 // This must be a constant between 0 and 31, for shift amounts. 16165 if (CVal >= 0 && CVal <= 31) 16166 break; 16167 } 16168 return; 16169 16170 case 'O': 16171 if (Subtarget->isThumb1Only()) { 16172 // This must be a multiple of 4 between -508 and 508, for 16173 // ADD/SUB sp = sp + immediate. 
16174 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 16175 break; 16176 } 16177 return; 16178 } 16179 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 16180 break; 16181 } 16182 16183 if (Result.getNode()) { 16184 Ops.push_back(Result); 16185 return; 16186 } 16187 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16188 } 16189 16190 static RTLIB::Libcall getDivRemLibcall( 16191 const SDNode *N, MVT::SimpleValueType SVT) { 16192 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 16193 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 16194 "Unhandled Opcode in getDivRemLibcall"); 16195 bool isSigned = N->getOpcode() == ISD::SDIVREM || 16196 N->getOpcode() == ISD::SREM; 16197 RTLIB::Libcall LC; 16198 switch (SVT) { 16199 default: llvm_unreachable("Unexpected request for libcall!"); 16200 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 16201 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 16202 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 16203 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 16204 } 16205 return LC; 16206 } 16207 16208 static TargetLowering::ArgListTy getDivRemArgList( 16209 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 16210 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 16211 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 16212 "Unhandled Opcode in getDivRemArgList"); 16213 bool isSigned = N->getOpcode() == ISD::SDIVREM || 16214 N->getOpcode() == ISD::SREM; 16215 TargetLowering::ArgListTy Args; 16216 TargetLowering::ArgListEntry Entry; 16217 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 16218 EVT ArgVT = N->getOperand(i).getValueType(); 16219 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 16220 Entry.Node = N->getOperand(i); 16221 Entry.Ty = ArgTy; 16222 Entry.IsSExt = isSigned; 16223 Entry.IsZExt = !isSigned; 16224 Args.push_back(Entry); 16225 } 16226 if (Subtarget->isTargetWindows() && Args.size() >= 2) 16227 std::swap(Args[0], Args[1]); 16228 return Args; 16229 } 16230 16231 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 16232 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 16233 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 16234 Subtarget->isTargetWindows()) && 16235 "Register-based DivRem lowering only"); 16236 unsigned Opcode = Op->getOpcode(); 16237 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 16238 "Invalid opcode for Div/Rem lowering"); 16239 bool isSigned = (Opcode == ISD::SDIVREM); 16240 EVT VT = Op->getValueType(0); 16241 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 16242 SDLoc dl(Op); 16243 16244 // If the target has hardware divide, use divide + multiply + subtract: 16245 // div = a / b 16246 // rem = a - b * div 16247 // return {div, rem} 16248 // This should be lowered into UDIV/SDIV + MLS later on. 16249 bool hasDivide = Subtarget->isThumb() ? 
Subtarget->hasDivideInThumbMode() 16250 : Subtarget->hasDivideInARMMode(); 16251 if (hasDivide && Op->getValueType(0).isSimple() && 16252 Op->getSimpleValueType(0) == MVT::i32) { 16253 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; 16254 const SDValue Dividend = Op->getOperand(0); 16255 const SDValue Divisor = Op->getOperand(1); 16256 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 16257 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 16258 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 16259 16260 SDValue Values[2] = {Div, Rem}; 16261 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 16262 } 16263 16264 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 16265 VT.getSimpleVT().SimpleTy); 16266 SDValue InChain = DAG.getEntryNode(); 16267 16268 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 16269 DAG.getContext(), 16270 Subtarget); 16271 16272 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16273 getPointerTy(DAG.getDataLayout())); 16274 16275 Type *RetTy = StructType::get(Ty, Ty); 16276 16277 if (Subtarget->isTargetWindows()) 16278 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 16279 16280 TargetLowering::CallLoweringInfo CLI(DAG); 16281 CLI.setDebugLoc(dl).setChain(InChain) 16282 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 16283 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 16284 16285 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 16286 return CallInfo.first; 16287 } 16288 16289 // Lowers REM using divmod helpers 16290 // see RTABI section 4.2/4.3 16291 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 16292 // Build return types (div and rem) 16293 std::vector<Type*> RetTyParams; 16294 Type *RetTyElement; 16295 16296 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 16297 default: llvm_unreachable("Unexpected request for libcall!"); 16298 case MVT::i8: RetTyElement = 
Type::getInt8Ty(*DAG.getContext()); break; 16299 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 16300 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 16301 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 16302 } 16303 16304 RetTyParams.push_back(RetTyElement); 16305 RetTyParams.push_back(RetTyElement); 16306 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 16307 Type *RetTy = StructType::get(*DAG.getContext(), ret); 16308 16309 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 16310 SimpleTy); 16311 SDValue InChain = DAG.getEntryNode(); 16312 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 16313 Subtarget); 16314 bool isSigned = N->getOpcode() == ISD::SREM; 16315 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16316 getPointerTy(DAG.getDataLayout())); 16317 16318 if (Subtarget->isTargetWindows()) 16319 InChain = WinDBZCheckDenominator(DAG, N, InChain); 16320 16321 // Lower call 16322 CallLoweringInfo CLI(DAG); 16323 CLI.setChain(InChain) 16324 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 16325 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 16326 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 16327 16328 // Return second (rem) result operand (first contains div) 16329 SDNode *ResNode = CallResult.first.getNode(); 16330 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 16331 return ResNode->getOperand(1); 16332 } 16333 16334 SDValue 16335 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 16336 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 16337 SDLoc DL(Op); 16338 16339 // Get the inputs. 
16340 SDValue Chain = Op.getOperand(0); 16341 SDValue Size = Op.getOperand(1); 16342 16343 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 16344 "no-stack-arg-probe")) { 16345 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 16346 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16347 Chain = SP.getValue(1); 16348 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 16349 if (Align) 16350 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 16351 DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); 16352 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 16353 SDValue Ops[2] = { SP, Chain }; 16354 return DAG.getMergeValues(Ops, DL); 16355 } 16356 16357 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 16358 DAG.getConstant(2, DL, MVT::i32)); 16359 16360 SDValue Flag; 16361 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 16362 Flag = Chain.getValue(1); 16363 16364 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 16365 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 16366 16367 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16368 Chain = NewSP.getValue(1); 16369 16370 SDValue Ops[2] = { NewSP, Chain }; 16371 return DAG.getMergeValues(Ops, DL); 16372 } 16373 16374 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 16375 bool IsStrict = Op->isStrictFPOpcode(); 16376 SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); 16377 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16378 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 16379 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 16380 "Unexpected type for custom-lowering FP_EXTEND"); 16381 16382 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16383 "With both FP DP and 16, any FP conversion is legal!"); 16384 16385 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 16386 "With FP16, 16 to 32 conversion is legal!"); 16387 16388 // Converting from 32 -> 64 is valid if we have FP64. 16389 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { 16390 // FIXME: Remove this when we have strict fp instruction selection patterns 16391 if (IsStrict) { 16392 SDLoc Loc(Op); 16393 SDValue Result = DAG.getNode(ISD::FP_EXTEND, 16394 Loc, Op.getValueType(), SrcVal); 16395 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 16396 } 16397 return Op; 16398 } 16399 16400 // Either we are converting from 16 -> 64, without FP16 and/or 16401 // FP.double-precision or without Armv8-fp. So we must do it in two 16402 // steps. 16403 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 16404 // without FP16. So we must do a function call. 16405 SDLoc Loc(Op); 16406 RTLIB::Libcall LC; 16407 MakeLibCallOptions CallOptions; 16408 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 16409 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 16410 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 16411 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 16412 MVT DstVT = (Sz == 16 ? 
MVT::f32 : MVT::f64); 16413 if (Supported) { 16414 if (IsStrict) { 16415 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 16416 {DstVT, MVT::Other}, {Chain, SrcVal}); 16417 Chain = SrcVal.getValue(1); 16418 } else { 16419 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 16420 } 16421 } else { 16422 LC = RTLIB::getFPEXT(SrcVT, DstVT); 16423 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16424 "Unexpected type for custom-lowering FP_EXTEND"); 16425 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 16426 Loc, Chain); 16427 } 16428 } 16429 16430 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 16431 } 16432 16433 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 16434 bool IsStrict = Op->isStrictFPOpcode(); 16435 16436 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 16437 EVT SrcVT = SrcVal.getValueType(); 16438 EVT DstVT = Op.getValueType(); 16439 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16440 const unsigned SrcSz = SrcVT.getSizeInBits(); 16441 (void)DstSz; 16442 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 16443 "Unexpected type for custom-lowering FP_ROUND"); 16444 16445 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16446 "With both FP DP and 16, any FP conversion is legal!"); 16447 16448 SDLoc Loc(Op); 16449 16450 // Instruction from 32 -> 16 if hasFP16 is valid 16451 if (SrcSz == 32 && Subtarget->hasFP16()) 16452 return Op; 16453 16454 // Lib call from 32 -> 16 / 64 -> [32, 16] 16455 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 16456 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16457 "Unexpected type for custom-lowering FP_ROUND"); 16458 MakeLibCallOptions CallOptions; 16459 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 16460 SDValue Result; 16461 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 16462 Loc, Chain); 16463 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 16464 } 16465 16466 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 16467 SelectionDAG &DAG) const { 16468 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 16469 MVT HalfT = MVT::i32; 16470 SDLoc dl(N); 16471 SDValue Hi, Lo, Tmp; 16472 16473 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 16474 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 16475 return ; 16476 16477 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 16478 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 16479 16480 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16481 DAG.getConstant(0, dl, HalfT)); 16482 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16483 DAG.getConstant(1, dl, HalfT)); 16484 16485 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 16486 DAG.getConstant(OpTypeBits - 1, dl, 16487 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 16488 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 16489 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 16490 SDValue(Lo.getNode(), 1)); 16491 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 16492 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 16493 16494 Results.push_back(Lo); 16495 Results.push_back(Hi); 16496 } 16497 16498 bool 16499 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 16500 // The ARM target isn't yet aware of offsets. 16501 return false; 16502 } 16503 16504 bool ARM::isBitFieldInvertedMask(unsigned v) { 16505 if (v == 0xffffffff) 16506 return false; 16507 16508 // there can be 1's on either or both "outsides", all the "inside" 16509 // bits must be 0's 16510 return isShiftedMask_32(~v); 16511 } 16512 16513 /// isFPImmLegal - Returns true if the target can instruction select the 16514 /// specified FP immediate natively. If false, the legalizer will 16515 /// materialize the FP immediate as a load from a constant pool. 
16516 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 16517 bool ForCodeSize) const { 16518 if (!Subtarget->hasVFP3Base()) 16519 return false; 16520 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 16521 return ARM_AM::getFP16Imm(Imm) != -1; 16522 if (VT == MVT::f32) 16523 return ARM_AM::getFP32Imm(Imm) != -1; 16524 if (VT == MVT::f64 && Subtarget->hasFP64()) 16525 return ARM_AM::getFP64Imm(Imm) != -1; 16526 return false; 16527 } 16528 16529 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 16530 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 16531 /// specified in the intrinsic calls. 16532 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 16533 const CallInst &I, 16534 MachineFunction &MF, 16535 unsigned Intrinsic) const { 16536 switch (Intrinsic) { 16537 case Intrinsic::arm_neon_vld1: 16538 case Intrinsic::arm_neon_vld2: 16539 case Intrinsic::arm_neon_vld3: 16540 case Intrinsic::arm_neon_vld4: 16541 case Intrinsic::arm_neon_vld2lane: 16542 case Intrinsic::arm_neon_vld3lane: 16543 case Intrinsic::arm_neon_vld4lane: 16544 case Intrinsic::arm_neon_vld2dup: 16545 case Intrinsic::arm_neon_vld3dup: 16546 case Intrinsic::arm_neon_vld4dup: { 16547 Info.opc = ISD::INTRINSIC_W_CHAIN; 16548 // Conservatively set memVT to the entire set of vectors loaded. 
16549 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16550 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 16551 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16552 Info.ptrVal = I.getArgOperand(0); 16553 Info.offset = 0; 16554 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 16555 Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); 16556 // volatile loads with NEON intrinsics not supported 16557 Info.flags = MachineMemOperand::MOLoad; 16558 return true; 16559 } 16560 case Intrinsic::arm_neon_vld1x2: 16561 case Intrinsic::arm_neon_vld1x3: 16562 case Intrinsic::arm_neon_vld1x4: { 16563 Info.opc = ISD::INTRINSIC_W_CHAIN; 16564 // Conservatively set memVT to the entire set of vectors loaded. 16565 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16566 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 16567 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16568 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 16569 Info.offset = 0; 16570 Info.align.reset(); 16571 // volatile loads with NEON intrinsics not supported 16572 Info.flags = MachineMemOperand::MOLoad; 16573 return true; 16574 } 16575 case Intrinsic::arm_neon_vst1: 16576 case Intrinsic::arm_neon_vst2: 16577 case Intrinsic::arm_neon_vst3: 16578 case Intrinsic::arm_neon_vst4: 16579 case Intrinsic::arm_neon_vst2lane: 16580 case Intrinsic::arm_neon_vst3lane: 16581 case Intrinsic::arm_neon_vst4lane: { 16582 Info.opc = ISD::INTRINSIC_VOID; 16583 // Conservatively set memVT to the entire set of vectors stored. 
16584 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16585 unsigned NumElts = 0; 16586 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 16587 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 16588 if (!ArgTy->isVectorTy()) 16589 break; 16590 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 16591 } 16592 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16593 Info.ptrVal = I.getArgOperand(0); 16594 Info.offset = 0; 16595 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 16596 Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); 16597 // volatile stores with NEON intrinsics not supported 16598 Info.flags = MachineMemOperand::MOStore; 16599 return true; 16600 } 16601 case Intrinsic::arm_neon_vst1x2: 16602 case Intrinsic::arm_neon_vst1x3: 16603 case Intrinsic::arm_neon_vst1x4: { 16604 Info.opc = ISD::INTRINSIC_VOID; 16605 // Conservatively set memVT to the entire set of vectors stored. 16606 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16607 unsigned NumElts = 0; 16608 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 16609 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 16610 if (!ArgTy->isVectorTy()) 16611 break; 16612 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 16613 } 16614 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 16615 Info.ptrVal = I.getArgOperand(0); 16616 Info.offset = 0; 16617 Info.align.reset(); 16618 // volatile stores with NEON intrinsics not supported 16619 Info.flags = MachineMemOperand::MOStore; 16620 return true; 16621 } 16622 case Intrinsic::arm_ldaex: 16623 case Intrinsic::arm_ldrex: { 16624 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16625 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 16626 Info.opc = ISD::INTRINSIC_W_CHAIN; 16627 Info.memVT = MVT::getVT(PtrTy->getElementType()); 16628 Info.ptrVal = I.getArgOperand(0); 16629 
Info.offset = 0; 16630 Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); 16631 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 16632 return true; 16633 } 16634 case Intrinsic::arm_stlex: 16635 case Intrinsic::arm_strex: { 16636 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 16637 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 16638 Info.opc = ISD::INTRINSIC_W_CHAIN; 16639 Info.memVT = MVT::getVT(PtrTy->getElementType()); 16640 Info.ptrVal = I.getArgOperand(1); 16641 Info.offset = 0; 16642 Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); 16643 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 16644 return true; 16645 } 16646 case Intrinsic::arm_stlexd: 16647 case Intrinsic::arm_strexd: 16648 Info.opc = ISD::INTRINSIC_W_CHAIN; 16649 Info.memVT = MVT::i64; 16650 Info.ptrVal = I.getArgOperand(2); 16651 Info.offset = 0; 16652 Info.align = Align(8); 16653 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 16654 return true; 16655 16656 case Intrinsic::arm_ldaexd: 16657 case Intrinsic::arm_ldrexd: 16658 Info.opc = ISD::INTRINSIC_W_CHAIN; 16659 Info.memVT = MVT::i64; 16660 Info.ptrVal = I.getArgOperand(0); 16661 Info.offset = 0; 16662 Info.align = Align(8); 16663 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 16664 return true; 16665 16666 default: 16667 break; 16668 } 16669 16670 return false; 16671 } 16672 16673 /// Returns true if it is beneficial to convert a load of a constant 16674 /// to just the constant itself. 
16675 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 16676 Type *Ty) const { 16677 assert(Ty->isIntegerTy()); 16678 16679 unsigned Bits = Ty->getPrimitiveSizeInBits(); 16680 if (Bits == 0 || Bits > 32) 16681 return false; 16682 return true; 16683 } 16684 16685 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 16686 unsigned Index) const { 16687 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 16688 return false; 16689 16690 return (Index == 0 || Index == ResVT.getVectorNumElements()); 16691 } 16692 16693 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 16694 ARM_MB::MemBOpt Domain) const { 16695 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16696 16697 // First, if the target has no DMB, see what fallback we can use. 16698 if (!Subtarget->hasDataBarrier()) { 16699 // Some ARMv6 cpus can support data barriers with an mcr instruction. 16700 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 16701 // here. 16702 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 16703 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 16704 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 16705 Builder.getInt32(0), Builder.getInt32(7), 16706 Builder.getInt32(10), Builder.getInt32(5)}; 16707 return Builder.CreateCall(MCR, args); 16708 } else { 16709 // Instead of using barriers, atomic accesses on these subtargets use 16710 // libcalls. 16711 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 16712 } 16713 } else { 16714 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 16715 // Only a full system barrier exists in the M-class architectures. 16716 Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain; 16717 Constant *CDomain = Builder.getInt32(Domain); 16718 return Builder.CreateCall(DMB, CDomain); 16719 } 16720 } 16721 16722 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 16723 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 16724 Instruction *Inst, 16725 AtomicOrdering Ord) const { 16726 switch (Ord) { 16727 case AtomicOrdering::NotAtomic: 16728 case AtomicOrdering::Unordered: 16729 llvm_unreachable("Invalid fence: unordered/non-atomic"); 16730 case AtomicOrdering::Monotonic: 16731 case AtomicOrdering::Acquire: 16732 return nullptr; // Nothing to do 16733 case AtomicOrdering::SequentiallyConsistent: 16734 if (!Inst->hasAtomicStore()) 16735 return nullptr; // Nothing to do 16736 LLVM_FALLTHROUGH; 16737 case AtomicOrdering::Release: 16738 case AtomicOrdering::AcquireRelease: 16739 if (Subtarget->preferISHSTBarriers()) 16740 return makeDMB(Builder, ARM_MB::ISHST); 16741 // FIXME: add a comment with a link to documentation justifying this. 
16742 else 16743 return makeDMB(Builder, ARM_MB::ISH); 16744 } 16745 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 16746 } 16747 16748 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 16749 Instruction *Inst, 16750 AtomicOrdering Ord) const { 16751 switch (Ord) { 16752 case AtomicOrdering::NotAtomic: 16753 case AtomicOrdering::Unordered: 16754 llvm_unreachable("Invalid fence: unordered/not-atomic"); 16755 case AtomicOrdering::Monotonic: 16756 case AtomicOrdering::Release: 16757 return nullptr; // Nothing to do 16758 case AtomicOrdering::Acquire: 16759 case AtomicOrdering::AcquireRelease: 16760 case AtomicOrdering::SequentiallyConsistent: 16761 return makeDMB(Builder, ARM_MB::ISH); 16762 } 16763 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 16764 } 16765 16766 // Loads and stores less than 64-bits are already atomic; ones above that 16767 // are doomed anyway, so defer to the default libcall and blame the OS when 16768 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 16769 // anything for those. 16770 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 16771 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 16772 return (Size == 64) && !Subtarget->isMClass(); 16773 } 16774 16775 // Loads and stores less than 64-bits are already atomic; ones above that 16776 // are doomed anyway, so defer to the default libcall and blame the OS when 16777 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 16778 // anything for those. 16779 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
// A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
//
// Returns LLOnly for exactly-64-bit loads on non-M-class subtargets and None
// for everything else.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
//
// Floating-point RMW operations are always expanded through cmpxchg.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  // LL/SC expansion is used in ARM mode, or in Thumb mode when v8-M Baseline
  // ops are available.
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

/// Decide how a cmpxchg instruction is expanded: LLSC when optimizing and the
/// subtarget condition below holds, None otherwise.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

/// Returns the InsertFencesForAtomic setting; the instruction \p I itself is
/// not inspected.
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

/// Insert the stack-protector declarations into \p M. For non-MSVC
/// environments this defers to the generic TargetLowering implementation;
/// for MSVC it declares the CRT's cookie variable and check function.
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  // Mark the (single) parameter of the check function as inreg.
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

/// Return the stack guard value used by SelectionDAG: the MSVC cookie global
/// on MSVC environments, otherwise whatever the generic implementation picks.
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

/// Return the function that validates the stack guard: the MSVC check
/// function on MSVC environments, otherwise the generic implementation's
/// answer.
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

/// Return true if a store of an element extracted from \p VectorTy at index
/// \p Idx can be combined into a single operation; on success \p Cost is set
/// to 0. \p Cost is left untouched when false is returned.
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

/// Speculating cttz is reported as cheap when v6T2 ops are available.
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

/// Speculating ctlz is reported as cheap when v6T2 ops are available.
bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

/// Shifts are expanded unless the subtarget is in min-size mode; on Windows
/// targets they are expanded regardless.
bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

/// Emit a load-linked (ldrex/ldaex family) from \p Addr at the builder's
/// insertion point and return the loaded value, typed as the pointee type of
/// \p Addr. The acquire variants are used when \p Ord is acquire or stronger.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    // The 64-bit intrinsics are called with an i8* address.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // On big-endian subtargets the two halves are exchanged.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    // val64 = lo64 | (hi64 << 32)
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  // Narrower accesses: the intrinsic is overloaded on the pointer type and its
  // result is truncated (or bitcast) back to the pointee type.
  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

/// Emit a clrex call at the builder's insertion point; does nothing when the
/// subtarget lacks v7 ops.
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

/// Emit a store-conditional (strex/stlex family) of \p Val to \p Addr at the
/// builder's insertion point and return the intrinsic call. The release
/// variants are used when \p Ord is release or stronger.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    // On big-endian subtargets the two halves are exchanged.
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    // The 64-bit intrinsics are called with an i8* address.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  // Narrower accesses: the intrinsic is overloaded on the pointer type and the
  // value is zero-extended (or bitcast) to the type of its first parameter.
  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}


/// Loops are aligned even when optimizing for size on M-class subtargets.
bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
///
/// This is the size of \p VecTy in bits divided by 128, rounded up.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

/// Return true if an interleaved access with the given \p Factor on
/// sub-vectors of type \p VecTy can be lowered to vldN/vstN intrinsics.
bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Either NEON or MVE integer ops are required.
  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  // A factor of 3 is rejected when MVE integer ops are available.
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  // (The 64-bit case is only accepted with NEON.)
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

/// Maximum interleave factor: 4 with NEON, MVEMaxSupportedInterleaveFactor
/// with MVE integer ops only, otherwise the TargetLoweringBase default.
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
///        %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
///
/// Returns false, without emitting anything, when the access type is not legal
/// for interleaved lowering; returns true once every shuffle in \p Shuffles
/// has had its uses replaced.
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we have neither NEON nor MVE and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  // Emits one vldN call loading Factor sub-vectors of type VecTy from the
  // given address: a NEON vld2/3/4 when NEON is available, otherwise an MVE
  // vld2q/vld4q.
  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      // NEON: overloaded on (result vector, i8*); operands are the address
      // and the alignment of the original load.
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      // MVE: overloaded on (result vector, element pointer); the address is
      // the only operand.
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
          LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g.
Lower an interleaved store (Factor = 3): 17170 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 17171 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 17172 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 17173 /// 17174 /// Into: 17175 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 17176 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 17177 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 17178 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 17179 /// 17180 /// Note that the new shufflevectors will be removed and we'll only generate one 17181 /// vst3 instruction in CodeGen. 17182 /// 17183 /// Example for a more general valid mask (Factor 3). Lower: 17184 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 17185 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 17186 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 17187 /// 17188 /// Into: 17189 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 17190 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 17191 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 17192 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 17193 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 17194 ShuffleVectorInst *SVI, 17195 unsigned Factor) const { 17196 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 17197 "Invalid interleave factor"); 17198 17199 VectorType *VecTy = SVI->getType(); 17200 assert(VecTy->getVectorNumElements() % Factor == 0 && 17201 "Invalid interleaved store"); 17202 17203 unsigned LaneLen = VecTy->getVectorNumElements() / Factor; 17204 Type *EltTy = VecTy->getVectorElementType(); 17205 VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); 17206 17207 const DataLayout &DL = SI->getModule()->getDataLayout(); 17208 17209 // Skip if we do not have NEON and skip illegal vector types. 
We can 17210 // "legalize" wide vector types into multiple interleaved accesses as long as 17211 // the vector types are divisible by 128. 17212 if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) 17213 return false; 17214 17215 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 17216 17217 Value *Op0 = SVI->getOperand(0); 17218 Value *Op1 = SVI->getOperand(1); 17219 IRBuilder<> Builder(SI); 17220 17221 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 17222 // vectors to integer vectors. 17223 if (EltTy->isPointerTy()) { 17224 Type *IntTy = DL.getIntPtrType(EltTy); 17225 17226 // Convert to the corresponding integer vector. 17227 Type *IntVecTy = 17228 VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); 17229 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 17230 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 17231 17232 SubVecTy = VectorType::get(IntTy, LaneLen); 17233 } 17234 17235 // The base address of the store. 17236 Value *BaseAddr = SI->getPointerOperand(); 17237 17238 if (NumStores > 1) { 17239 // If we're going to generate more than one store, reset the lane length 17240 // and sub-vector type to something legal. 17241 LaneLen /= NumStores; 17242 SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); 17243 17244 // We will compute the pointer operand of each store from the original base 17245 // address using GEPs. Cast the base address to a pointer to the scalar 17246 // element type. 
17247 BaseAddr = Builder.CreateBitCast( 17248 BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( 17249 SI->getPointerAddressSpace())); 17250 } 17251 17252 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); 17253 17254 auto Mask = SVI->getShuffleMask(); 17255 17256 auto createStoreIntrinsic = [&](Value *BaseAddr, 17257 SmallVectorImpl<Value *> &Shuffles) { 17258 if (Subtarget->hasNEON()) { 17259 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 17260 Intrinsic::arm_neon_vst3, 17261 Intrinsic::arm_neon_vst4}; 17262 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 17263 Type *Tys[] = {Int8Ptr, SubVecTy}; 17264 17265 Function *VstNFunc = Intrinsic::getDeclaration( 17266 SI->getModule(), StoreInts[Factor - 2], Tys); 17267 17268 SmallVector<Value *, 6> Ops; 17269 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 17270 for (auto S : Shuffles) 17271 Ops.push_back(S); 17272 Ops.push_back(Builder.getInt32(SI->getAlignment())); 17273 Builder.CreateCall(VstNFunc, Ops); 17274 } else { 17275 assert((Factor == 2 || Factor == 4) && 17276 "expected interleave factor of 2 or 4 for MVE"); 17277 Intrinsic::ID StoreInts = 17278 Factor == 2 ? 
Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; 17279 Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( 17280 SI->getPointerAddressSpace()); 17281 Type *Tys[] = {EltPtrTy, SubVecTy}; 17282 Function *VstNFunc = 17283 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); 17284 17285 SmallVector<Value *, 6> Ops; 17286 Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); 17287 for (auto S : Shuffles) 17288 Ops.push_back(S); 17289 for (unsigned F = 0; F < Factor; F++) { 17290 Ops.push_back(Builder.getInt32(F)); 17291 Builder.CreateCall(VstNFunc, Ops); 17292 Ops.pop_back(); 17293 } 17294 } 17295 }; 17296 17297 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 17298 // If we generating more than one store, we compute the base address of 17299 // subsequent stores as an offset from the previous. 17300 if (StoreCount > 0) 17301 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), 17302 BaseAddr, LaneLen * Factor); 17303 17304 SmallVector<Value *, 4> Shuffles; 17305 17306 // Split the shufflevector operands into sub vectors for the new vstN call. 17307 for (unsigned i = 0; i < Factor; i++) { 17308 unsigned IdxI = StoreCount * LaneLen * Factor + i; 17309 if (Mask[IdxI] >= 0) { 17310 Shuffles.push_back(Builder.CreateShuffleVector( 17311 Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); 17312 } else { 17313 unsigned StartMask = 0; 17314 for (unsigned j = 1; j < LaneLen; j++) { 17315 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 17316 if (Mask[IdxJ * Factor + IdxI] >= 0) { 17317 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 17318 break; 17319 } 17320 } 17321 // Note: If all elements in a chunk are undefs, StartMask=0! 17322 // Note: Filling undef gaps with random elements is ok, since 17323 // those elements were being written anyway (with undefs). 
17324 // In the case of all undefs we're defaulting to using elems from 0 17325 // Note: StartMask cannot be negative, it's checked in 17326 // isReInterleaveMask 17327 Shuffles.push_back(Builder.CreateShuffleVector( 17328 Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); 17329 } 17330 } 17331 17332 createStoreIntrinsic(BaseAddr, Shuffles); 17333 } 17334 return true; 17335 } 17336 17337 enum HABaseType { 17338 HA_UNKNOWN = 0, 17339 HA_FLOAT, 17340 HA_DOUBLE, 17341 HA_VECT64, 17342 HA_VECT128 17343 }; 17344 17345 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 17346 uint64_t &Members) { 17347 if (auto *ST = dyn_cast<StructType>(Ty)) { 17348 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 17349 uint64_t SubMembers = 0; 17350 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 17351 return false; 17352 Members += SubMembers; 17353 } 17354 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 17355 uint64_t SubMembers = 0; 17356 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 17357 return false; 17358 Members += SubMembers * AT->getNumElements(); 17359 } else if (Ty->isFloatTy()) { 17360 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 17361 return false; 17362 Members = 1; 17363 Base = HA_FLOAT; 17364 } else if (Ty->isDoubleTy()) { 17365 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 17366 return false; 17367 Members = 1; 17368 Base = HA_DOUBLE; 17369 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 17370 Members = 1; 17371 switch (Base) { 17372 case HA_FLOAT: 17373 case HA_DOUBLE: 17374 return false; 17375 case HA_VECT64: 17376 return VT->getBitWidth() == 64; 17377 case HA_VECT128: 17378 return VT->getBitWidth() == 128; 17379 case HA_UNKNOWN: 17380 switch (VT->getBitWidth()) { 17381 case 64: 17382 Base = HA_VECT64; 17383 return true; 17384 case 128: 17385 Base = HA_VECT128; 17386 return true; 17387 default: 17388 return false; 17389 } 17390 } 17391 } 17392 17393 return (Members > 0 && Members <= 4); 
17394 } 17395 17396 /// Return the correct alignment for the current calling convention. 17397 Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, 17398 DataLayout DL) const { 17399 const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); 17400 if (!ArgTy->isVectorTy()) 17401 return ABITypeAlign; 17402 17403 // Avoid over-aligning vector parameters. It would require realigning the 17404 // stack and waste space for no real benefit. 17405 return std::min(ABITypeAlign, DL.getStackAlignment()); 17406 } 17407 17408 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 17409 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 17410 /// passing according to AAPCS rules. 17411 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 17412 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 17413 if (getEffectiveCallingConv(CallConv, isVarArg) != 17414 CallingConv::ARM_AAPCS_VFP) 17415 return false; 17416 17417 HABaseType Base = HA_UNKNOWN; 17418 uint64_t Members = 0; 17419 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 17420 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 17421 17422 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 17423 return IsHA || IsIntArray; 17424 } 17425 17426 unsigned ARMTargetLowering::getExceptionPointerRegister( 17427 const Constant *PersonalityFn) const { 17428 // Platforms which do not use SjLj EH may return values in these registers 17429 // via the personality function. 17430 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; 17431 } 17432 17433 unsigned ARMTargetLowering::getExceptionSelectorRegister( 17434 const Constant *PersonalityFn) const { 17435 // Platforms which do not use SjLj EH may return values in these registers 17436 // via the personality function. 17437 return Subtarget->useSjLjEH() ? 
ARM::NoRegister : ARM::R1; 17438 } 17439 17440 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 17441 // Update IsSplitCSR in ARMFunctionInfo. 17442 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 17443 AFI->setIsSplitCSR(true); 17444 } 17445 17446 void ARMTargetLowering::insertCopiesSplitCSR( 17447 MachineBasicBlock *Entry, 17448 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 17449 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 17450 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 17451 if (!IStart) 17452 return; 17453 17454 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 17455 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 17456 MachineBasicBlock::iterator MBBI = Entry->begin(); 17457 for (const MCPhysReg *I = IStart; *I; ++I) { 17458 const TargetRegisterClass *RC = nullptr; 17459 if (ARM::GPRRegClass.contains(*I)) 17460 RC = &ARM::GPRRegClass; 17461 else if (ARM::DPRRegClass.contains(*I)) 17462 RC = &ARM::DPRRegClass; 17463 else 17464 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 17465 17466 Register NewVR = MRI->createVirtualRegister(RC); 17467 // Create copy from CSR to a virtual register. 17468 // FIXME: this currently does not emit CFI pseudo-instructions, it works 17469 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 17470 // nounwind. If we want to generalize this later, we may need to emit 17471 // CFI pseudo-instructions. 17472 assert(Entry->getParent()->getFunction().hasFnAttribute( 17473 Attribute::NoUnwind) && 17474 "Function should be nounwind in insertCopiesSplitCSR!"); 17475 Entry->addLiveIn(*I); 17476 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 17477 .addReg(*I); 17478 17479 // Insert the copy-back instructions right before the terminator. 
17480 for (auto *Exit : Exits) 17481 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 17482 TII->get(TargetOpcode::COPY), *I) 17483 .addReg(NewVR); 17484 } 17485 } 17486 17487 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { 17488 MF.getFrameInfo().computeMaxCallFrameSize(MF); 17489 TargetLoweringBase::finalizeLowering(MF); 17490 } 17491