//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
"llvm/MC/MCRegisterInfo.h" 92 #include "llvm/MC/MCSchedule.h" 93 #include "llvm/Support/AtomicOrdering.h" 94 #include "llvm/Support/BranchProbability.h" 95 #include "llvm/Support/Casting.h" 96 #include "llvm/Support/CodeGen.h" 97 #include "llvm/Support/CommandLine.h" 98 #include "llvm/Support/Compiler.h" 99 #include "llvm/Support/Debug.h" 100 #include "llvm/Support/ErrorHandling.h" 101 #include "llvm/Support/KnownBits.h" 102 #include "llvm/Support/MachineValueType.h" 103 #include "llvm/Support/MathExtras.h" 104 #include "llvm/Support/raw_ostream.h" 105 #include "llvm/Target/TargetMachine.h" 106 #include "llvm/Target/TargetOptions.h" 107 #include <algorithm> 108 #include <cassert> 109 #include <cstdint> 110 #include <cstdlib> 111 #include <iterator> 112 #include <limits> 113 #include <string> 114 #include <tuple> 115 #include <utility> 116 #include <vector> 117 118 using namespace llvm; 119 using namespace llvm::PatternMatch; 120 121 #define DEBUG_TYPE "arm-isel" 122 123 STATISTIC(NumTailCalls, "Number of tail calls"); 124 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 125 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 126 STATISTIC(NumConstpoolPromoted, 127 "Number of constants with their storage promoted into constant pools"); 128 129 static cl::opt<bool> 130 ARMInterworking("arm-interworking", cl::Hidden, 131 cl::desc("Enable / disable ARM interworking (for debugging only)"), 132 cl::init(true)); 133 134 static cl::opt<bool> EnableConstpoolPromotion( 135 "arm-promote-constant", cl::Hidden, 136 cl::desc("Enable / disable promotion of unnamed_addr constants into " 137 "constant pools"), 138 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 139 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 140 "arm-promote-constant-max-size", cl::Hidden, 141 cl::desc("Maximum size of constant to promote into a constant pool"), 142 cl::init(64)); 143 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 144 "arm-promote-constant-max-total", cl::Hidden, 145 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 146 cl::init(128)); 147 148 cl::opt<unsigned> 149 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 150 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 151 cl::init(2)); 152 153 // The APCS parameter registers. 

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    } else {
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // Custom Expand smaller than legal vector reductions to prevent false zero
  // items being added.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Legal);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);
  setOperationAction(ISD::AND, MVT::v2i1, Expand);
  setOperationAction(ISD::OR, MVT::v2i1, Expand);
  setOperationAction(ISD::XOR, MVT::v2i1, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
489 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 490 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 491 static const struct { 492 const RTLIB::Libcall Op; 493 const char * const Name; 494 const ISD::CondCode Cond; 495 } LibraryCalls[] = { 496 // Single-precision floating-point arithmetic. 497 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 498 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 499 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 500 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 501 502 // Double-precision floating-point arithmetic. 503 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 504 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 505 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 506 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 507 508 // Single-precision comparisons. 509 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 510 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 511 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 512 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 513 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 514 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 515 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 516 517 // Double-precision comparisons. 518 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 519 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 520 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 521 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 522 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 523 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 524 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 525 526 // Floating-point to integer conversions. 527 // i64 conversions are done via library routines even when generating VFP 528 // instructions, so use the same ones. 529 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 530 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 531 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 532 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 533 534 // Conversions between floating types. 535 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 536 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 537 538 // Integer to floating-point conversions. 539 // i64 conversions are done via library routines even when generating VFP 540 // instructions, so use the same ones. 541 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 542 // e.g., __floatunsidf vs. __floatunssidfvfp. 543 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 544 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 545 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 546 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 547 }; 548 549 for (const auto &LC : LibraryCalls) { 550 setLibcallName(LC.Op, LC.Name); 551 if (LC.Cond != ISD::SETCC_INVALID) 552 setCmpLibcallCC(LC.Op, LC.Cond); 553 } 554 } 555 } 556 557 // These libcalls are not available in 32-bit. 

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);
  setLibcallName(RTLIB::MULO_I64, nullptr);
  setLibcallName(RTLIB::MULO_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
735 if (Subtarget->isTargetAEABI()) { 736 static const struct { 737 const RTLIB::Libcall Op; 738 const char * const Name; 739 const CallingConv::ID CC; 740 } LibraryCalls[] = { 741 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 742 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 743 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 744 }; 745 746 for (const auto &LC : LibraryCalls) { 747 setLibcallName(LC.Op, LC.Name); 748 setLibcallCallingConv(LC.Op, LC.CC); 749 } 750 } 751 752 if (Subtarget->isThumb1Only()) 753 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 754 else 755 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 756 757 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 758 Subtarget->hasFPRegs()) { 759 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 760 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 761 762 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); 763 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); 764 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); 765 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); 766 767 if (!Subtarget->hasVFP2Base()) 768 setAllExpand(MVT::f32); 769 if (!Subtarget->hasFP64()) 770 setAllExpand(MVT::f64); 771 } 772 773 if (Subtarget->hasFullFP16()) { 774 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 775 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 776 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 777 778 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 779 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 780 } 781 782 if (Subtarget->hasBF16()) { 783 addRegisterClass(MVT::bf16, &ARM::HPRRegClass); 784 setAllExpand(MVT::bf16); 785 if (!Subtarget->hasFullFP16()) 786 setOperationAction(ISD::BITCAST, MVT::bf16, Custom); 787 } 788 789 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 790 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 791 setTruncStoreAction(VT, InnerVT, Expand); 792 addAllExtLoads(VT, InnerVT, Expand); 793 } 794 795 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 796 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 797 798 setOperationAction(ISD::BSWAP, VT, Expand); 799 } 800 801 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 802 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 803 804 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 805 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 806 807 if (Subtarget->hasMVEIntegerOps()) 808 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 809 810 // Combine low-overhead loop intrinsics so that we can lower i1 types. 
811 if (Subtarget->hasLOB()) { 812 setTargetDAGCombine(ISD::BRCOND); 813 setTargetDAGCombine(ISD::BR_CC); 814 } 815 816 if (Subtarget->hasNEON()) { 817 addDRTypeForNEON(MVT::v2f32); 818 addDRTypeForNEON(MVT::v8i8); 819 addDRTypeForNEON(MVT::v4i16); 820 addDRTypeForNEON(MVT::v2i32); 821 addDRTypeForNEON(MVT::v1i64); 822 823 addQRTypeForNEON(MVT::v4f32); 824 addQRTypeForNEON(MVT::v2f64); 825 addQRTypeForNEON(MVT::v16i8); 826 addQRTypeForNEON(MVT::v8i16); 827 addQRTypeForNEON(MVT::v4i32); 828 addQRTypeForNEON(MVT::v2i64); 829 830 if (Subtarget->hasFullFP16()) { 831 addQRTypeForNEON(MVT::v8f16); 832 addDRTypeForNEON(MVT::v4f16); 833 } 834 835 if (Subtarget->hasBF16()) { 836 addQRTypeForNEON(MVT::v8bf16); 837 addDRTypeForNEON(MVT::v4bf16); 838 } 839 } 840 841 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 842 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 843 // none of Neon, MVE or VFP supports any arithmetic operations on it. 844 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 845 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 846 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 847 // FIXME: Code duplication: FDIV and FREM are expanded always, see 848 // ARMTargetLowering::addTypeForNEON method for details. 849 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 850 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 851 // FIXME: Create unittest. 852 // In another words, find a way when "copysign" appears in DAG with vector 853 // operands. 854 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 855 // FIXME: Code duplication: SETCC has custom operation action, see 856 // ARMTargetLowering::addTypeForNEON method for details. 857 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 858 // FIXME: Create unittest for FNEG and for FABS. 859 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 860 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 861 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 862 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 863 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 864 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 865 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 866 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 867 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 868 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 869 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 870 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 871 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 872 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 873 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 874 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 875 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 876 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 877 } 878 879 if (Subtarget->hasNEON()) { 880 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 881 // supported for v4f32. 
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);
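    // Note: setTargetDAGCombine only registers interest in these node kinds;
    // the corresponding rewrites are implemented in
    // ARMTargetLowering::PerformDAGCombine.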

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::VECREDUCE_ADD);
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::BITCAST);
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::SMIN);
    setTargetDAGCombine(ISD::UMIN);
    setTargetDAGCombine(ISD::SMAX);
    setTargetDAGCombine(ISD::UMAX);
    setTargetDAGCombine(ISD::FP_EXTEND);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SELECT_CC);
    setTargetDAGCombine(ISD::SETCC);
  }
  if (Subtarget->hasMVEFloatOps()) {
    setTargetDAGCombine(ISD::FADD);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
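  // (That is, pre/post-increment and pre/post-decrement: ISD::PRE_INC,
  // ISD::PRE_DEC, ISD::POST_INC and ISD::POST_DEC, as iterated below.)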
1111 if (!Subtarget->isThumb1Only()) { 1112 for (unsigned im = (unsigned)ISD::PRE_INC; 1113 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1114 setIndexedLoadAction(im, MVT::i1, Legal); 1115 setIndexedLoadAction(im, MVT::i8, Legal); 1116 setIndexedLoadAction(im, MVT::i16, Legal); 1117 setIndexedLoadAction(im, MVT::i32, Legal); 1118 setIndexedStoreAction(im, MVT::i1, Legal); 1119 setIndexedStoreAction(im, MVT::i8, Legal); 1120 setIndexedStoreAction(im, MVT::i16, Legal); 1121 setIndexedStoreAction(im, MVT::i32, Legal); 1122 } 1123 } else { 1124 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 1125 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1126 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1127 } 1128 1129 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1130 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1131 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1132 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1133 1134 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1135 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1136 if (Subtarget->hasDSP()) { 1137 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1138 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1139 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1140 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1141 setOperationAction(ISD::UADDSAT, MVT::i8, Custom); 1142 setOperationAction(ISD::USUBSAT, MVT::i8, Custom); 1143 setOperationAction(ISD::UADDSAT, MVT::i16, Custom); 1144 setOperationAction(ISD::USUBSAT, MVT::i16, Custom); 1145 } 1146 if (Subtarget->hasBaseDSP()) { 1147 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1148 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1149 } 1150 1151 // i64 operation support. 1152 setOperationAction(ISD::MUL, MVT::i64, Expand); 1153 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1154 if (Subtarget->isThumb1Only()) { 1155 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1156 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1157 } 1158 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1159 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1160 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1161 1162 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1163 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1164 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1165 setOperationAction(ISD::SRL, MVT::i64, Custom); 1166 setOperationAction(ISD::SRA, MVT::i64, Custom); 1167 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1168 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1169 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1170 setOperationAction(ISD::STORE, MVT::i64, Custom); 1171 1172 // MVE lowers 64 bit shifts to lsll and lsrl 1173 // assuming that ISD::SRL and SRA of i64 are already marked custom 1174 if (Subtarget->hasMVEIntegerOps()) 1175 setOperationAction(ISD::SHL, MVT::i64, Custom); 1176 1177 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1178 if (Subtarget->isThumb1Only()) { 1179 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1180 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1181 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1182 } 1183 1184 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1185 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1186 1187 // ARM does not have ROTL. 
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If the target has DMB in Thumb, fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
1347 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1348 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1349 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1350 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1351 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1352 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1353 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1354 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1355 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1356 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1357 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1358 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1359 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1360 // Unordered/Monotonic case. 1361 if (!InsertFencesForAtomic) { 1362 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1363 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1364 } 1365 } 1366 1367 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1368 1369 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1370 if (!Subtarget->hasV6Ops()) { 1371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1372 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1373 } 1374 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1375 1376 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1377 !Subtarget->isThumb1Only()) { 1378 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1379 // iff target supports vfp2. 1380 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1381 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1382 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 1383 } 1384 1385 // We want to custom lower some of our intrinsics. 
1386 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1387 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1388 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1389 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1390 if (Subtarget->useSjLjEH()) 1391 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1392 1393 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1394 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1395 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1396 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1397 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1398 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1399 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1400 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1401 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1402 if (Subtarget->hasFullFP16()) { 1403 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1404 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1405 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1406 } 1407 1408 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1409 1410 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1411 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1412 if (Subtarget->hasFullFP16()) 1413 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1414 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1415 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1416 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1417 1418 // We don't support sin/cos/fmod/copysign/pow 1419 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1420 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1421 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1422 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1423 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1424 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1425 setOperationAction(ISD::FREM, MVT::f64, Expand); 1426 setOperationAction(ISD::FREM, MVT::f32, Expand); 1427 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1428 !Subtarget->isThumb1Only()) { 1429 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1430 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1431 } 1432 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1433 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1434 1435 if (!Subtarget->hasVFP4Base()) { 1436 setOperationAction(ISD::FMA, MVT::f64, Expand); 1437 setOperationAction(ISD::FMA, MVT::f32, Expand); 1438 } 1439 1440 // Various VFP goodness 1441 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1442 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1443 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1444 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1445 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1446 } 1447 1448 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1449 if (!Subtarget->hasFP16()) { 1450 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1451 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1452 } 1453 1454 // Strict floating-point comparisons need custom lowering. 
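// They are expected to be selected as a VFP compare followed by an FMSTAT
// transfer of the flags (see the ARMISD::CMPFP/CMPFPE/FMSTAT nodes); the
// signalling STRICT_FSETCCS form would use the exception-raising compare
// (vcmpe). This note only summarises the custom lowering configured below.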
1455 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1456 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1457 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1458 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1459 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1460 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1461 } 1462 1463 // Use __sincos_stret if available. 1464 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1465 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1466 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1467 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1468 } 1469 1470 // FP-ARMv8 implements a lot of rounding-like FP operations. 1471 if (Subtarget->hasFPARMv8Base()) { 1472 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1473 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1474 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1475 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1476 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1477 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1478 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1479 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1480 if (Subtarget->hasNEON()) { 1481 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1482 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1483 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1484 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1485 } 1486 1487 if (Subtarget->hasFP64()) { 1488 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1489 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1490 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1491 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1492 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1493 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1494 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1495 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1496 } 1497 } 1498 1499 // FP16 often need to be promoted to call lib functions 1500 if (Subtarget->hasFullFP16()) { 1501 setOperationAction(ISD::FREM, MVT::f16, Promote); 1502 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1503 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1504 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1505 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1506 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1507 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1508 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1509 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1510 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1511 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1512 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1513 1514 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1515 } 1516 1517 if (Subtarget->hasNEON()) { 1518 // vmin and vmax aren't available in a scalar form, so we can use 1519 // a NEON instruction with an undef lane instead. This has a performance 1520 // penalty on some cores, so we don't do this unless we have been 1521 // asked to by the core tuning model. 
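// (Concretely, a scalar fminimum/fmaximum may then be matched to a NEON
// vmin.f32/vmax.f32 whose unused lanes are undef; whether that trade-off is
// worthwhile is what useNEONForSinglePrecisionFP() reports, which the check
// below follows.)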
1522 if (Subtarget->useNEONForSinglePrecisionFP()) { 1523 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1524 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1525 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1526 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1527 } 1528 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1529 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1530 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1531 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1532 1533 if (Subtarget->hasFullFP16()) { 1534 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1535 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1536 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1537 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1538 1539 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1540 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1541 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1542 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1543 } 1544 } 1545 1546 // We have target-specific dag combine patterns for the following nodes: 1547 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1548 setTargetDAGCombine(ISD::ADD); 1549 setTargetDAGCombine(ISD::SUB); 1550 setTargetDAGCombine(ISD::MUL); 1551 setTargetDAGCombine(ISD::AND); 1552 setTargetDAGCombine(ISD::OR); 1553 setTargetDAGCombine(ISD::XOR); 1554 1555 if (Subtarget->hasMVEIntegerOps()) 1556 setTargetDAGCombine(ISD::VSELECT); 1557 1558 if (Subtarget->hasV6Ops()) 1559 setTargetDAGCombine(ISD::SRL); 1560 if (Subtarget->isThumb1Only()) 1561 setTargetDAGCombine(ISD::SHL); 1562 1563 setStackPointerRegisterToSaveRestore(ARM::SP); 1564 1565 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1566 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1567 setSchedulingPreference(Sched::RegPressure); 1568 else 1569 setSchedulingPreference(Sched::Hybrid); 1570 1571 //// temporary - rewrite interface to use type 1572 MaxStoresPerMemset = 8; 1573 MaxStoresPerMemsetOptSize = 4; 1574 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1575 MaxStoresPerMemcpyOptSize = 2; 1576 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1577 MaxStoresPerMemmoveOptSize = 2; 1578 1579 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1580 // are at least 4 bytes aligned. 1581 setMinStackArgumentAlignment(Align(4)); 1582 1583 // Prefer likely predicted branches to selects on out-of-order cores. 1584 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1585 1586 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1587 1588 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1589 1590 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1591 setTargetDAGCombine(ISD::ABS); 1592 } 1593 1594 bool ARMTargetLowering::useSoftFloat() const { 1595 return Subtarget->useSoftFloat(); 1596 } 1597 1598 // FIXME: It might make sense to define the representative register class as the 1599 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1600 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1601 // SPR's representative would be DPR_VFP2. This should work well if register 1602 // pressure tracking were modified such that a register use would increment the 1603 // pressure of the register class's representative and all of it's super 1604 // classes' representatives transitively. 
We have not implemented this because 1605 // of the difficulty prior to coalescing of modeling operand register classes 1606 // due to the common occurrence of cross class copies and subregister insertions 1607 // and extractions. 1608 std::pair<const TargetRegisterClass *, uint8_t> 1609 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1610 MVT VT) const { 1611 const TargetRegisterClass *RRC = nullptr; 1612 uint8_t Cost = 1; 1613 switch (VT.SimpleTy) { 1614 default: 1615 return TargetLowering::findRepresentativeClass(TRI, VT); 1616 // Use DPR as representative register class for all floating point 1617 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1618 // the cost is 1 for both f32 and f64. 1619 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1620 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1621 RRC = &ARM::DPRRegClass; 1622 // When NEON is used for SP, only half of the register file is available 1623 // because operations that define both SP and DP results will be constrained 1624 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1625 // coalescing by double-counting the SP regs. See the FIXME above. 1626 if (Subtarget->useNEONForSinglePrecisionFP()) 1627 Cost = 2; 1628 break; 1629 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1630 case MVT::v4f32: case MVT::v2f64: 1631 RRC = &ARM::DPRRegClass; 1632 Cost = 2; 1633 break; 1634 case MVT::v4i64: 1635 RRC = &ARM::DPRRegClass; 1636 Cost = 4; 1637 break; 1638 case MVT::v8i64: 1639 RRC = &ARM::DPRRegClass; 1640 Cost = 8; 1641 break; 1642 } 1643 return std::make_pair(RRC, Cost); 1644 } 1645 1646 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1647 #define MAKE_CASE(V) \ 1648 case V: \ 1649 return #V; 1650 switch ((ARMISD::NodeType)Opcode) { 1651 case ARMISD::FIRST_NUMBER: 1652 break; 1653 MAKE_CASE(ARMISD::Wrapper) 1654 MAKE_CASE(ARMISD::WrapperPIC) 1655 MAKE_CASE(ARMISD::WrapperJT) 1656 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) 1657 MAKE_CASE(ARMISD::CALL) 1658 MAKE_CASE(ARMISD::CALL_PRED) 1659 MAKE_CASE(ARMISD::CALL_NOLINK) 1660 MAKE_CASE(ARMISD::tSECALL) 1661 MAKE_CASE(ARMISD::t2CALL_BTI) 1662 MAKE_CASE(ARMISD::BRCOND) 1663 MAKE_CASE(ARMISD::BR_JT) 1664 MAKE_CASE(ARMISD::BR2_JT) 1665 MAKE_CASE(ARMISD::RET_FLAG) 1666 MAKE_CASE(ARMISD::SERET_FLAG) 1667 MAKE_CASE(ARMISD::INTRET_FLAG) 1668 MAKE_CASE(ARMISD::PIC_ADD) 1669 MAKE_CASE(ARMISD::CMP) 1670 MAKE_CASE(ARMISD::CMN) 1671 MAKE_CASE(ARMISD::CMPZ) 1672 MAKE_CASE(ARMISD::CMPFP) 1673 MAKE_CASE(ARMISD::CMPFPE) 1674 MAKE_CASE(ARMISD::CMPFPw0) 1675 MAKE_CASE(ARMISD::CMPFPEw0) 1676 MAKE_CASE(ARMISD::BCC_i64) 1677 MAKE_CASE(ARMISD::FMSTAT) 1678 MAKE_CASE(ARMISD::CMOV) 1679 MAKE_CASE(ARMISD::SUBS) 1680 MAKE_CASE(ARMISD::SSAT) 1681 MAKE_CASE(ARMISD::USAT) 1682 MAKE_CASE(ARMISD::ASRL) 1683 MAKE_CASE(ARMISD::LSRL) 1684 MAKE_CASE(ARMISD::LSLL) 1685 MAKE_CASE(ARMISD::SRL_FLAG) 1686 MAKE_CASE(ARMISD::SRA_FLAG) 1687 MAKE_CASE(ARMISD::RRX) 1688 MAKE_CASE(ARMISD::ADDC) 1689 MAKE_CASE(ARMISD::ADDE) 1690 MAKE_CASE(ARMISD::SUBC) 1691 MAKE_CASE(ARMISD::SUBE) 1692 MAKE_CASE(ARMISD::LSLS) 1693 MAKE_CASE(ARMISD::VMOVRRD) 1694 MAKE_CASE(ARMISD::VMOVDRR) 1695 MAKE_CASE(ARMISD::VMOVhr) 1696 MAKE_CASE(ARMISD::VMOVrh) 1697 MAKE_CASE(ARMISD::VMOVSR) 1698 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) 1699 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) 1700 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) 1701 MAKE_CASE(ARMISD::TC_RETURN) 1702 MAKE_CASE(ARMISD::THREAD_POINTER) 1703 MAKE_CASE(ARMISD::DYN_ALLOC) 
1704 MAKE_CASE(ARMISD::MEMBARRIER_MCR) 1705 MAKE_CASE(ARMISD::PRELOAD) 1706 MAKE_CASE(ARMISD::LDRD) 1707 MAKE_CASE(ARMISD::STRD) 1708 MAKE_CASE(ARMISD::WIN__CHKSTK) 1709 MAKE_CASE(ARMISD::WIN__DBZCHK) 1710 MAKE_CASE(ARMISD::PREDICATE_CAST) 1711 MAKE_CASE(ARMISD::VECTOR_REG_CAST) 1712 MAKE_CASE(ARMISD::MVESEXT) 1713 MAKE_CASE(ARMISD::MVEZEXT) 1714 MAKE_CASE(ARMISD::MVETRUNC) 1715 MAKE_CASE(ARMISD::VCMP) 1716 MAKE_CASE(ARMISD::VCMPZ) 1717 MAKE_CASE(ARMISD::VTST) 1718 MAKE_CASE(ARMISD::VSHLs) 1719 MAKE_CASE(ARMISD::VSHLu) 1720 MAKE_CASE(ARMISD::VSHLIMM) 1721 MAKE_CASE(ARMISD::VSHRsIMM) 1722 MAKE_CASE(ARMISD::VSHRuIMM) 1723 MAKE_CASE(ARMISD::VRSHRsIMM) 1724 MAKE_CASE(ARMISD::VRSHRuIMM) 1725 MAKE_CASE(ARMISD::VRSHRNIMM) 1726 MAKE_CASE(ARMISD::VQSHLsIMM) 1727 MAKE_CASE(ARMISD::VQSHLuIMM) 1728 MAKE_CASE(ARMISD::VQSHLsuIMM) 1729 MAKE_CASE(ARMISD::VQSHRNsIMM) 1730 MAKE_CASE(ARMISD::VQSHRNuIMM) 1731 MAKE_CASE(ARMISD::VQSHRNsuIMM) 1732 MAKE_CASE(ARMISD::VQRSHRNsIMM) 1733 MAKE_CASE(ARMISD::VQRSHRNuIMM) 1734 MAKE_CASE(ARMISD::VQRSHRNsuIMM) 1735 MAKE_CASE(ARMISD::VSLIIMM) 1736 MAKE_CASE(ARMISD::VSRIIMM) 1737 MAKE_CASE(ARMISD::VGETLANEu) 1738 MAKE_CASE(ARMISD::VGETLANEs) 1739 MAKE_CASE(ARMISD::VMOVIMM) 1740 MAKE_CASE(ARMISD::VMVNIMM) 1741 MAKE_CASE(ARMISD::VMOVFPIMM) 1742 MAKE_CASE(ARMISD::VDUP) 1743 MAKE_CASE(ARMISD::VDUPLANE) 1744 MAKE_CASE(ARMISD::VEXT) 1745 MAKE_CASE(ARMISD::VREV64) 1746 MAKE_CASE(ARMISD::VREV32) 1747 MAKE_CASE(ARMISD::VREV16) 1748 MAKE_CASE(ARMISD::VZIP) 1749 MAKE_CASE(ARMISD::VUZP) 1750 MAKE_CASE(ARMISD::VTRN) 1751 MAKE_CASE(ARMISD::VTBL1) 1752 MAKE_CASE(ARMISD::VTBL2) 1753 MAKE_CASE(ARMISD::VMOVN) 1754 MAKE_CASE(ARMISD::VQMOVNs) 1755 MAKE_CASE(ARMISD::VQMOVNu) 1756 MAKE_CASE(ARMISD::VCVTN) 1757 MAKE_CASE(ARMISD::VCVTL) 1758 MAKE_CASE(ARMISD::VIDUP) 1759 MAKE_CASE(ARMISD::VMULLs) 1760 MAKE_CASE(ARMISD::VMULLu) 1761 MAKE_CASE(ARMISD::VQDMULH) 1762 MAKE_CASE(ARMISD::VADDVs) 1763 MAKE_CASE(ARMISD::VADDVu) 1764 MAKE_CASE(ARMISD::VADDVps) 1765 MAKE_CASE(ARMISD::VADDVpu) 1766 MAKE_CASE(ARMISD::VADDLVs) 1767 MAKE_CASE(ARMISD::VADDLVu) 1768 MAKE_CASE(ARMISD::VADDLVAs) 1769 MAKE_CASE(ARMISD::VADDLVAu) 1770 MAKE_CASE(ARMISD::VADDLVps) 1771 MAKE_CASE(ARMISD::VADDLVpu) 1772 MAKE_CASE(ARMISD::VADDLVAps) 1773 MAKE_CASE(ARMISD::VADDLVApu) 1774 MAKE_CASE(ARMISD::VMLAVs) 1775 MAKE_CASE(ARMISD::VMLAVu) 1776 MAKE_CASE(ARMISD::VMLAVps) 1777 MAKE_CASE(ARMISD::VMLAVpu) 1778 MAKE_CASE(ARMISD::VMLALVs) 1779 MAKE_CASE(ARMISD::VMLALVu) 1780 MAKE_CASE(ARMISD::VMLALVps) 1781 MAKE_CASE(ARMISD::VMLALVpu) 1782 MAKE_CASE(ARMISD::VMLALVAs) 1783 MAKE_CASE(ARMISD::VMLALVAu) 1784 MAKE_CASE(ARMISD::VMLALVAps) 1785 MAKE_CASE(ARMISD::VMLALVApu) 1786 MAKE_CASE(ARMISD::VMINVu) 1787 MAKE_CASE(ARMISD::VMINVs) 1788 MAKE_CASE(ARMISD::VMAXVu) 1789 MAKE_CASE(ARMISD::VMAXVs) 1790 MAKE_CASE(ARMISD::UMAAL) 1791 MAKE_CASE(ARMISD::UMLAL) 1792 MAKE_CASE(ARMISD::SMLAL) 1793 MAKE_CASE(ARMISD::SMLALBB) 1794 MAKE_CASE(ARMISD::SMLALBT) 1795 MAKE_CASE(ARMISD::SMLALTB) 1796 MAKE_CASE(ARMISD::SMLALTT) 1797 MAKE_CASE(ARMISD::SMULWB) 1798 MAKE_CASE(ARMISD::SMULWT) 1799 MAKE_CASE(ARMISD::SMLALD) 1800 MAKE_CASE(ARMISD::SMLALDX) 1801 MAKE_CASE(ARMISD::SMLSLD) 1802 MAKE_CASE(ARMISD::SMLSLDX) 1803 MAKE_CASE(ARMISD::SMMLAR) 1804 MAKE_CASE(ARMISD::SMMLSR) 1805 MAKE_CASE(ARMISD::QADD16b) 1806 MAKE_CASE(ARMISD::QSUB16b) 1807 MAKE_CASE(ARMISD::QADD8b) 1808 MAKE_CASE(ARMISD::QSUB8b) 1809 MAKE_CASE(ARMISD::UQADD16b) 1810 MAKE_CASE(ARMISD::UQSUB16b) 1811 MAKE_CASE(ARMISD::UQADD8b) 1812 MAKE_CASE(ARMISD::UQSUB8b) 1813 MAKE_CASE(ARMISD::BUILD_VECTOR) 1814 
MAKE_CASE(ARMISD::BFI) 1815 MAKE_CASE(ARMISD::VORRIMM) 1816 MAKE_CASE(ARMISD::VBICIMM) 1817 MAKE_CASE(ARMISD::VBSP) 1818 MAKE_CASE(ARMISD::MEMCPY) 1819 MAKE_CASE(ARMISD::VLD1DUP) 1820 MAKE_CASE(ARMISD::VLD2DUP) 1821 MAKE_CASE(ARMISD::VLD3DUP) 1822 MAKE_CASE(ARMISD::VLD4DUP) 1823 MAKE_CASE(ARMISD::VLD1_UPD) 1824 MAKE_CASE(ARMISD::VLD2_UPD) 1825 MAKE_CASE(ARMISD::VLD3_UPD) 1826 MAKE_CASE(ARMISD::VLD4_UPD) 1827 MAKE_CASE(ARMISD::VLD1x2_UPD) 1828 MAKE_CASE(ARMISD::VLD1x3_UPD) 1829 MAKE_CASE(ARMISD::VLD1x4_UPD) 1830 MAKE_CASE(ARMISD::VLD2LN_UPD) 1831 MAKE_CASE(ARMISD::VLD3LN_UPD) 1832 MAKE_CASE(ARMISD::VLD4LN_UPD) 1833 MAKE_CASE(ARMISD::VLD1DUP_UPD) 1834 MAKE_CASE(ARMISD::VLD2DUP_UPD) 1835 MAKE_CASE(ARMISD::VLD3DUP_UPD) 1836 MAKE_CASE(ARMISD::VLD4DUP_UPD) 1837 MAKE_CASE(ARMISD::VST1_UPD) 1838 MAKE_CASE(ARMISD::VST2_UPD) 1839 MAKE_CASE(ARMISD::VST3_UPD) 1840 MAKE_CASE(ARMISD::VST4_UPD) 1841 MAKE_CASE(ARMISD::VST1x2_UPD) 1842 MAKE_CASE(ARMISD::VST1x3_UPD) 1843 MAKE_CASE(ARMISD::VST1x4_UPD) 1844 MAKE_CASE(ARMISD::VST2LN_UPD) 1845 MAKE_CASE(ARMISD::VST3LN_UPD) 1846 MAKE_CASE(ARMISD::VST4LN_UPD) 1847 MAKE_CASE(ARMISD::WLS) 1848 MAKE_CASE(ARMISD::WLSSETUP) 1849 MAKE_CASE(ARMISD::LE) 1850 MAKE_CASE(ARMISD::LOOP_DEC) 1851 MAKE_CASE(ARMISD::CSINV) 1852 MAKE_CASE(ARMISD::CSNEG) 1853 MAKE_CASE(ARMISD::CSINC) 1854 MAKE_CASE(ARMISD::MEMCPYLOOP) 1855 MAKE_CASE(ARMISD::MEMSETLOOP) 1856 #undef MAKE_CASE 1857 } 1858 return nullptr; 1859 } 1860 1861 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1862 EVT VT) const { 1863 if (!VT.isVector()) 1864 return getPointerTy(DL); 1865 1866 // MVE has a predicate register. 1867 if ((Subtarget->hasMVEIntegerOps() && 1868 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 1869 VT == MVT::v16i8)) || 1870 (Subtarget->hasMVEFloatOps() && 1871 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) 1872 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1873 return VT.changeVectorElementTypeToInteger(); 1874 } 1875 1876 /// getRegClassFor - Return the register class that should be used for the 1877 /// specified value type. 1878 const TargetRegisterClass * 1879 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1880 (void)isDivergent; 1881 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1882 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1883 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1884 // MVE Q registers. 1885 if (Subtarget->hasNEON()) { 1886 if (VT == MVT::v4i64) 1887 return &ARM::QQPRRegClass; 1888 if (VT == MVT::v8i64) 1889 return &ARM::QQQQPRRegClass; 1890 } 1891 if (Subtarget->hasMVEIntegerOps()) { 1892 if (VT == MVT::v4i64) 1893 return &ARM::MQQPRRegClass; 1894 if (VT == MVT::v8i64) 1895 return &ARM::MQQQQPRRegClass; 1896 } 1897 return TargetLowering::getRegClassFor(VT); 1898 } 1899 1900 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1901 // source/dest is aligned and the copy size is large enough. We therefore want 1902 // to align such objects passed to memory intrinsics. 1903 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1904 unsigned &PrefAlign) const { 1905 if (!isa<MemIntrinsic>(CI)) 1906 return false; 1907 MinSize = 8; 1908 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1909 // cycle faster than 4-byte aligned LDM. 1910 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 
8 : 4);
1911 return true;
1912 }
1913
1914 // Create a fast isel object.
1915 FastISel *
1916 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1917 const TargetLibraryInfo *libInfo) const {
1918 return ARM::createFastISel(funcInfo, libInfo);
1919 }
1920
1921 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1922 unsigned NumVals = N->getNumValues();
1923 if (!NumVals)
1924 return Sched::RegPressure;
1925
1926 for (unsigned i = 0; i != NumVals; ++i) {
1927 EVT VT = N->getValueType(i);
1928 if (VT == MVT::Glue || VT == MVT::Other)
1929 continue;
1930 if (VT.isFloatingPoint() || VT.isVector())
1931 return Sched::ILP;
1932 }
1933
1934 if (!N->isMachineOpcode())
1935 return Sched::RegPressure;
1936
1937 // Loads are scheduled for latency even if the instruction itinerary
1938 // is not available.
1939 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1940 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1941
1942 if (MCID.getNumDefs() == 0)
1943 return Sched::RegPressure;
1944 if (!Itins->isEmpty() &&
1945 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1946 return Sched::ILP;
1947
1948 return Sched::RegPressure;
1949 }
1950
1951 //===----------------------------------------------------------------------===//
1952 // Lowering Code
1953 //===----------------------------------------------------------------------===//
1954
1955 static bool isSRL16(const SDValue &Op) {
1956 if (Op.getOpcode() != ISD::SRL)
1957 return false;
1958 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1959 return Const->getZExtValue() == 16;
1960 return false;
1961 }
1962
1963 static bool isSRA16(const SDValue &Op) {
1964 if (Op.getOpcode() != ISD::SRA)
1965 return false;
1966 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1967 return Const->getZExtValue() == 16;
1968 return false;
1969 }
1970
1971 static bool isSHL16(const SDValue &Op) {
1972 if (Op.getOpcode() != ISD::SHL)
1973 return false;
1974 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1975 return Const->getZExtValue() == 16;
1976 return false;
1977 }
1978
1979 // Check for a signed 16-bit value. We special case SRA because it makes it
1980 // simpler when also looking for SRAs that aren't sign extending a
1981 // smaller value. Without the check, we'd need to take extra care with
1982 // checking order for some operations.
1983 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1984 if (isSRA16(Op))
1985 return isSHL16(Op.getOperand(0));
1986 return DAG.ComputeNumSignBits(Op) == 17;
1987 }
1988
1989 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1990 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1991 switch (CC) {
1992 default: llvm_unreachable("Unknown condition code!");
1993 case ISD::SETNE: return ARMCC::NE;
1994 case ISD::SETEQ: return ARMCC::EQ;
1995 case ISD::SETGT: return ARMCC::GT;
1996 case ISD::SETGE: return ARMCC::GE;
1997 case ISD::SETLT: return ARMCC::LT;
1998 case ISD::SETLE: return ARMCC::LE;
1999 case ISD::SETUGT: return ARMCC::HI;
2000 case ISD::SETUGE: return ARMCC::HS;
2001 case ISD::SETULT: return ARMCC::LO;
2002 case ISD::SETULE: return ARMCC::LS;
2003 }
2004 }
2005
2006 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
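/// Some predicates need a second ARM condition: SETUEQ, for example, maps to
/// EQ with CondCode2 = VS ("equal, or unordered"), and SETONE maps to MI plus
/// GT, which is why the CondCode2 out-parameter exists.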
2007 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 2008 ARMCC::CondCodes &CondCode2) { 2009 CondCode2 = ARMCC::AL; 2010 switch (CC) { 2011 default: llvm_unreachable("Unknown FP condition!"); 2012 case ISD::SETEQ: 2013 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 2014 case ISD::SETGT: 2015 case ISD::SETOGT: CondCode = ARMCC::GT; break; 2016 case ISD::SETGE: 2017 case ISD::SETOGE: CondCode = ARMCC::GE; break; 2018 case ISD::SETOLT: CondCode = ARMCC::MI; break; 2019 case ISD::SETOLE: CondCode = ARMCC::LS; break; 2020 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 2021 case ISD::SETO: CondCode = ARMCC::VC; break; 2022 case ISD::SETUO: CondCode = ARMCC::VS; break; 2023 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 2024 case ISD::SETUGT: CondCode = ARMCC::HI; break; 2025 case ISD::SETUGE: CondCode = ARMCC::PL; break; 2026 case ISD::SETLT: 2027 case ISD::SETULT: CondCode = ARMCC::LT; break; 2028 case ISD::SETLE: 2029 case ISD::SETULE: CondCode = ARMCC::LE; break; 2030 case ISD::SETNE: 2031 case ISD::SETUNE: CondCode = ARMCC::NE; break; 2032 } 2033 } 2034 2035 //===----------------------------------------------------------------------===// 2036 // Calling Convention Implementation 2037 //===----------------------------------------------------------------------===// 2038 2039 /// getEffectiveCallingConv - Get the effective calling convention, taking into 2040 /// account presence of floating point hardware and calling convention 2041 /// limitations, such as support for variadic functions. 2042 CallingConv::ID 2043 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 2044 bool isVarArg) const { 2045 switch (CC) { 2046 default: 2047 report_fatal_error("Unsupported calling convention"); 2048 case CallingConv::ARM_AAPCS: 2049 case CallingConv::ARM_APCS: 2050 case CallingConv::GHC: 2051 case CallingConv::CFGuard_Check: 2052 return CC; 2053 case CallingConv::PreserveMost: 2054 return CallingConv::PreserveMost; 2055 case CallingConv::ARM_AAPCS_VFP: 2056 case CallingConv::Swift: 2057 case CallingConv::SwiftTail: 2058 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 2059 case CallingConv::C: 2060 case CallingConv::Tail: 2061 if (!Subtarget->isAAPCS_ABI()) 2062 return CallingConv::ARM_APCS; 2063 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 2064 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 2065 !isVarArg) 2066 return CallingConv::ARM_AAPCS_VFP; 2067 else 2068 return CallingConv::ARM_AAPCS; 2069 case CallingConv::Fast: 2070 case CallingConv::CXX_FAST_TLS: 2071 if (!Subtarget->isAAPCS_ABI()) { 2072 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 2073 return CallingConv::Fast; 2074 return CallingConv::ARM_APCS; 2075 } else if (Subtarget->hasVFP2Base() && 2076 !Subtarget->isThumb1Only() && !isVarArg) 2077 return CallingConv::ARM_AAPCS_VFP; 2078 else 2079 return CallingConv::ARM_AAPCS; 2080 } 2081 } 2082 2083 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2084 bool isVarArg) const { 2085 return CCAssignFnForNode(CC, false, isVarArg); 2086 } 2087 2088 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 2089 bool isVarArg) const { 2090 return CCAssignFnForNode(CC, true, isVarArg); 2091 } 2092 2093 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 2094 /// CallingConvention. 
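/// The mapping follows getEffectiveCallingConv(); for example the effective
/// ARM_AAPCS_VFP convention selects CC_ARM_AAPCS_VFP for arguments and
/// RetCC_ARM_AAPCS_VFP for return values.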
2095 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 2096 bool Return, 2097 bool isVarArg) const { 2098 switch (getEffectiveCallingConv(CC, isVarArg)) { 2099 default: 2100 report_fatal_error("Unsupported calling convention"); 2101 case CallingConv::ARM_APCS: 2102 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 2103 case CallingConv::ARM_AAPCS: 2104 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2105 case CallingConv::ARM_AAPCS_VFP: 2106 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 2107 case CallingConv::Fast: 2108 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 2109 case CallingConv::GHC: 2110 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 2111 case CallingConv::PreserveMost: 2112 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2113 case CallingConv::CFGuard_Check: 2114 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 2115 } 2116 } 2117 2118 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, 2119 MVT LocVT, MVT ValVT, SDValue Val) const { 2120 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), 2121 Val); 2122 if (Subtarget->hasFullFP16()) { 2123 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); 2124 } else { 2125 Val = DAG.getNode(ISD::TRUNCATE, dl, 2126 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2127 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); 2128 } 2129 return Val; 2130 } 2131 2132 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, 2133 MVT LocVT, MVT ValVT, 2134 SDValue Val) const { 2135 if (Subtarget->hasFullFP16()) { 2136 Val = DAG.getNode(ARMISD::VMOVrh, dl, 2137 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2138 } else { 2139 Val = DAG.getNode(ISD::BITCAST, dl, 2140 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2141 Val = DAG.getNode(ISD::ZERO_EXTEND, dl, 2142 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2143 } 2144 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); 2145 } 2146 2147 /// LowerCallResult - Lower the result values of a call into the 2148 /// appropriate copies out of appropriate physical registers. 2149 SDValue ARMTargetLowering::LowerCallResult( 2150 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2151 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2152 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2153 SDValue ThisVal) const { 2154 // Assign locations to each value returned by this call. 2155 SmallVector<CCValAssign, 16> RVLocs; 2156 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2157 *DAG.getContext()); 2158 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 2159 2160 // Copy all of the result registers out of their specified physreg. 2161 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2162 CCValAssign VA = RVLocs[i]; 2163 2164 // Pass 'this' value directly from the argument to return value, to avoid 2165 // reg unit interference 2166 if (i == 0 && isThisReturn) { 2167 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 2168 "unexpected return calling convention register assignment"); 2169 InVals.push_back(ThisVal); 2170 continue; 2171 } 2172 2173 SDValue Val; 2174 if (VA.needsCustom() && 2175 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { 2176 // Handle f64 or half of a v2f64. 
2177 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2178 InFlag); 2179 Chain = Lo.getValue(1); 2180 InFlag = Lo.getValue(2); 2181 VA = RVLocs[++i]; // skip ahead to next loc 2182 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2183 InFlag); 2184 Chain = Hi.getValue(1); 2185 InFlag = Hi.getValue(2); 2186 if (!Subtarget->isLittle()) 2187 std::swap (Lo, Hi); 2188 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2189 2190 if (VA.getLocVT() == MVT::v2f64) { 2191 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 2192 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2193 DAG.getConstant(0, dl, MVT::i32)); 2194 2195 VA = RVLocs[++i]; // skip ahead to next loc 2196 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2197 Chain = Lo.getValue(1); 2198 InFlag = Lo.getValue(2); 2199 VA = RVLocs[++i]; // skip ahead to next loc 2200 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2201 Chain = Hi.getValue(1); 2202 InFlag = Hi.getValue(2); 2203 if (!Subtarget->isLittle()) 2204 std::swap (Lo, Hi); 2205 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2206 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2207 DAG.getConstant(1, dl, MVT::i32)); 2208 } 2209 } else { 2210 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2211 InFlag); 2212 Chain = Val.getValue(1); 2213 InFlag = Val.getValue(2); 2214 } 2215 2216 switch (VA.getLocInfo()) { 2217 default: llvm_unreachable("Unknown loc info!"); 2218 case CCValAssign::Full: break; 2219 case CCValAssign::BCvt: 2220 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2221 break; 2222 } 2223 2224 // f16 arguments have their size extended to 4 bytes and passed as if they 2225 // had been copied to the LSBs of a 32-bit register. 
2226 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2227 if (VA.needsCustom() && 2228 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 2229 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); 2230 2231 InVals.push_back(Val); 2232 } 2233 2234 return Chain; 2235 } 2236 2237 std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg( 2238 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, 2239 bool IsTailCall, int SPDiff) const { 2240 SDValue DstAddr; 2241 MachinePointerInfo DstInfo; 2242 int32_t Offset = VA.getLocMemOffset(); 2243 MachineFunction &MF = DAG.getMachineFunction(); 2244 2245 if (IsTailCall) { 2246 Offset += SPDiff; 2247 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2248 int Size = VA.getLocVT().getFixedSizeInBits() / 8; 2249 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); 2250 DstAddr = DAG.getFrameIndex(FI, PtrVT); 2251 DstInfo = 2252 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 2253 } else { 2254 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); 2255 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2256 StackPtr, PtrOff); 2257 DstInfo = 2258 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); 2259 } 2260 2261 return std::make_pair(DstAddr, DstInfo); 2262 } 2263 2264 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2265 SDValue Chain, SDValue &Arg, 2266 RegsToPassVector &RegsToPass, 2267 CCValAssign &VA, CCValAssign &NextVA, 2268 SDValue &StackPtr, 2269 SmallVectorImpl<SDValue> &MemOpChains, 2270 bool IsTailCall, 2271 int SPDiff) const { 2272 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2273 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2274 unsigned id = Subtarget->isLittle() ? 0 : 1; 2275 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2276 2277 if (NextVA.isRegLoc()) 2278 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2279 else { 2280 assert(NextVA.isMemLoc()); 2281 if (!StackPtr.getNode()) 2282 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2283 getPointerTy(DAG.getDataLayout())); 2284 2285 SDValue DstAddr; 2286 MachinePointerInfo DstInfo; 2287 std::tie(DstAddr, DstInfo) = 2288 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); 2289 MemOpChains.push_back( 2290 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); 2291 } 2292 } 2293 2294 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { 2295 return (CC == CallingConv::Fast && GuaranteeTailCalls) || 2296 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 2297 } 2298 2299 /// LowerCall - Lowering a call into a callseq_start <- 2300 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2301 /// nodes. 
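/// For an ordinary call the emitted sequence is roughly: callseq_start,
/// stores/CopyToReg nodes for the outgoing arguments, the call node,
/// callseq_end, and the result copies built by LowerCallResult; tail calls
/// end in an ARMISD::TC_RETURN instead.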
2302 SDValue
2303 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2304 SmallVectorImpl<SDValue> &InVals) const {
2305 SelectionDAG &DAG = CLI.DAG;
2306 SDLoc &dl = CLI.DL;
2307 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2308 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2309 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2310 SDValue Chain = CLI.Chain;
2311 SDValue Callee = CLI.Callee;
2312 bool &isTailCall = CLI.IsTailCall;
2313 CallingConv::ID CallConv = CLI.CallConv;
2314 bool doesNotRet = CLI.DoesNotReturn;
2315 bool isVarArg = CLI.IsVarArg;
2316
2317 MachineFunction &MF = DAG.getMachineFunction();
2318 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2319 MachineFunction::CallSiteInfo CSInfo;
2320 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2321 bool isThisReturn = false;
2322 bool isCmseNSCall = false;
2323 bool isSibCall = false;
2324 bool PreferIndirect = false;
2325 bool GuardWithBTI = false;
2326
2327 // Lower 'returns_twice' calls to a pseudo-instruction.
2328 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2329 !Subtarget->getNoBTIAtReturnTwice())
2330 GuardWithBTI = AFI->branchTargetEnforcement();
2331
2332 // Determine whether this is a non-secure function call.
2333 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2334 isCmseNSCall = true;
2335
2336 // Disable tail calls if they're not supported.
2337 if (!Subtarget->supportsTailCall())
2338 isTailCall = false;
2339
2340 // For both the non-secure calls and the returns from a CMSE entry function,
2341 // the function needs to do some extra work after the call, or before the
2342 // return, respectively, thus it cannot end with a tail call.
2343 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2344 isTailCall = false;
2345
2346 if (isa<GlobalAddressSDNode>(Callee)) {
2347 // If we're optimizing for minimum size and the function is called three or
2348 // more times in this block, we can improve codesize by calling indirectly
2349 // as BLXr has a 16-bit encoding.
2350 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2351 if (CLI.CB) {
2352 auto *BB = CLI.CB->getParent();
2353 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2354 count_if(GV->users(), [&BB](const User *U) {
2355 return isa<Instruction>(U) &&
2356 cast<Instruction>(U)->getParent() == BB;
2357 }) > 2;
2358 }
2359 }
2360 if (isTailCall) {
2361 // Check if it's really possible to do a tail call.
2362 isTailCall = IsEligibleForTailCallOptimization(
2363 Callee, CallConv, isVarArg, isStructRet,
2364 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2365 PreferIndirect);
2366
2367 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2368 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2369 isSibCall = true;
2370
2371 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2372 // detected sibcalls.
2373 if (isTailCall)
2374 ++NumTailCalls;
2375 }
2376
2377 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2378 report_fatal_error("failed to perform tail call elimination on a call "
2379 "site marked musttail");
2380 // Analyze operands of the call, assigning locations to each operand.
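// (Under AAPCS, for example, the first four register-sized arguments land in
// r0-r3 and the rest receive stack offsets; the resulting CCValAssign entries
// drive the register copies and stores below.)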
2381 SmallVector<CCValAssign, 16> ArgLocs; 2382 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2383 *DAG.getContext()); 2384 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2385 2386 // Get a count of how many bytes are to be pushed on the stack. 2387 unsigned NumBytes = CCInfo.getNextStackOffset(); 2388 2389 // SPDiff is the byte offset of the call's argument area from the callee's. 2390 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2391 // by this amount for a tail call. In a sibling call it must be 0 because the 2392 // caller will deallocate the entire stack and the callee still expects its 2393 // arguments to begin at SP+0. Completely unused for non-tail calls. 2394 int SPDiff = 0; 2395 2396 if (isTailCall && !isSibCall) { 2397 auto FuncInfo = MF.getInfo<ARMFunctionInfo>(); 2398 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); 2399 2400 // Since callee will pop argument stack as a tail call, we must keep the 2401 // popped size 16-byte aligned. 2402 Align StackAlign = DAG.getDataLayout().getStackAlignment(); 2403 NumBytes = alignTo(NumBytes, StackAlign); 2404 2405 // SPDiff will be negative if this tail call requires more space than we 2406 // would automatically have in our incoming argument space. Positive if we 2407 // can actually shrink the stack. 2408 SPDiff = NumReusableBytes - NumBytes; 2409 2410 // If this call requires more stack than we have available from 2411 // LowerFormalArguments, tell FrameLowering to reserve space for it. 2412 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) 2413 AFI->setArgRegsSaveSize(-SPDiff); 2414 } 2415 2416 if (isSibCall) { 2417 // For sibling tail calls, memory operands are available in our caller's stack. 2418 NumBytes = 0; 2419 } else { 2420 // Adjust the stack pointer for the new arguments... 2421 // These operations are automatically eliminated by the prolog/epilog pass 2422 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl); 2423 } 2424 2425 SDValue StackPtr = 2426 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2427 2428 RegsToPassVector RegsToPass; 2429 SmallVector<SDValue, 8> MemOpChains; 2430 2431 // During a tail call, stores to the argument area must happen after all of 2432 // the function's incoming arguments have been loaded because they may alias. 2433 // This is done by folding in a TokenFactor from LowerFormalArguments, but 2434 // there's no point in doing so repeatedly so this tracks whether that's 2435 // happened yet. 2436 bool AfterFormalArgLoads = false; 2437 2438 // Walk the register/memloc assignments, inserting copies/loads. In the case 2439 // of tail call optimization, arguments are handled later. 2440 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2441 i != e; 2442 ++i, ++realArgIdx) { 2443 CCValAssign &VA = ArgLocs[i]; 2444 SDValue Arg = OutVals[realArgIdx]; 2445 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2446 bool isByVal = Flags.isByVal(); 2447 2448 // Promote the value if needed. 
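// For instance, an i8 or i16 value whose assigned location type is i32 is
// widened here with SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND according to
// VA.getLocInfo().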
2449 switch (VA.getLocInfo()) { 2450 default: llvm_unreachable("Unknown loc info!"); 2451 case CCValAssign::Full: break; 2452 case CCValAssign::SExt: 2453 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2454 break; 2455 case CCValAssign::ZExt: 2456 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2457 break; 2458 case CCValAssign::AExt: 2459 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2460 break; 2461 case CCValAssign::BCvt: 2462 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2463 break; 2464 } 2465 2466 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { 2467 Chain = DAG.getStackArgumentTokenFactor(Chain); 2468 AfterFormalArgLoads = true; 2469 } 2470 2471 // f16 arguments have their size extended to 4 bytes and passed as if they 2472 // had been copied to the LSBs of a 32-bit register. 2473 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2474 if (VA.needsCustom() && 2475 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { 2476 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 2477 } else { 2478 // f16 arguments could have been extended prior to argument lowering. 2479 // Mask them arguments if this is a CMSE nonsecure call. 2480 auto ArgVT = Outs[realArgIdx].ArgVT; 2481 if (isCmseNSCall && (ArgVT == MVT::f16)) { 2482 auto LocBits = VA.getLocVT().getSizeInBits(); 2483 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); 2484 SDValue Mask = 2485 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 2486 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 2487 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 2488 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2489 } 2490 } 2491 2492 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2493 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 2494 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2495 DAG.getConstant(0, dl, MVT::i32)); 2496 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2497 DAG.getConstant(1, dl, MVT::i32)); 2498 2499 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], 2500 StackPtr, MemOpChains, isTailCall, SPDiff); 2501 2502 VA = ArgLocs[++i]; // skip ahead to next loc 2503 if (VA.isRegLoc()) { 2504 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], 2505 StackPtr, MemOpChains, isTailCall, SPDiff); 2506 } else { 2507 assert(VA.isMemLoc()); 2508 SDValue DstAddr; 2509 MachinePointerInfo DstInfo; 2510 std::tie(DstAddr, DstInfo) = 2511 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); 2512 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo)); 2513 } 2514 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 2515 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2516 StackPtr, MemOpChains, isTailCall, SPDiff); 2517 } else if (VA.isRegLoc()) { 2518 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2519 Outs[0].VT == MVT::i32) { 2520 assert(VA.getLocVT() == MVT::i32 && 2521 "unexpected calling convention register assignment"); 2522 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2523 "unexpected use of 'returned'"); 2524 isThisReturn = true; 2525 } 2526 const TargetOptions &Options = DAG.getTarget().Options; 2527 if (Options.EmitCallSiteInfo) 2528 CSInfo.emplace_back(VA.getLocReg(), i); 2529 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2530 } else if (isByVal) { 2531 
assert(VA.isMemLoc()); 2532 unsigned offset = 0; 2533 2534 // True if this byval aggregate will be split between registers 2535 // and memory. 2536 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2537 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2538 2539 if (CurByValIdx < ByValArgsCount) { 2540 2541 unsigned RegBegin, RegEnd; 2542 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2543 2544 EVT PtrVT = 2545 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2546 unsigned int i, j; 2547 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2548 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2549 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2550 SDValue Load = 2551 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), 2552 DAG.InferPtrAlign(AddArg)); 2553 MemOpChains.push_back(Load.getValue(1)); 2554 RegsToPass.push_back(std::make_pair(j, Load)); 2555 } 2556 2557 // If parameter size outsides register area, "offset" value 2558 // helps us to calculate stack slot for remained part properly. 2559 offset = RegEnd - RegBegin; 2560 2561 CCInfo.nextInRegsParam(); 2562 } 2563 2564 if (Flags.getByValSize() > 4*offset) { 2565 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2566 SDValue Dst; 2567 MachinePointerInfo DstInfo; 2568 std::tie(Dst, DstInfo) = 2569 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); 2570 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2571 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2572 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2573 MVT::i32); 2574 SDValue AlignNode = 2575 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); 2576 2577 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2578 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2579 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2580 Ops)); 2581 } 2582 } else { 2583 assert(VA.isMemLoc()); 2584 SDValue DstAddr; 2585 MachinePointerInfo DstInfo; 2586 std::tie(DstAddr, DstInfo) = 2587 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); 2588 2589 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo); 2590 MemOpChains.push_back(Store); 2591 } 2592 } 2593 2594 if (!MemOpChains.empty()) 2595 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2596 2597 // Build a sequence of copy-to-reg nodes chained together with token chain 2598 // and flag operands which copy the outgoing args into the appropriate regs. 2599 SDValue InFlag; 2600 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2601 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2602 RegsToPass[i].second, InFlag); 2603 InFlag = Chain.getValue(1); 2604 } 2605 2606 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2607 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2608 // node so that legalize doesn't hack it. 
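// (Target* nodes are opaque to legalization and generic DAG combines, so the
// callee operand survives unchanged until instruction selection picks the
// actual call form.)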
2609 bool isDirect = false; 2610 2611 const TargetMachine &TM = getTargetMachine(); 2612 const Module *Mod = MF.getFunction().getParent(); 2613 const GlobalValue *GV = nullptr; 2614 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2615 GV = G->getGlobal(); 2616 bool isStub = 2617 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2618 2619 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2620 bool isLocalARMFunc = false; 2621 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2622 2623 if (Subtarget->genLongCalls()) { 2624 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2625 "long-calls codegen is not position independent!"); 2626 // Handle a global address or an external symbol. If it's not one of 2627 // those, the target's already in a register, so we don't need to do 2628 // anything extra. 2629 if (isa<GlobalAddressSDNode>(Callee)) { 2630 // Create a constant pool entry for the callee address 2631 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2632 ARMConstantPoolValue *CPV = 2633 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2634 2635 // Get the address of the callee into a register 2636 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2637 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2638 Callee = DAG.getLoad( 2639 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2640 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2641 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2642 const char *Sym = S->getSymbol(); 2643 2644 // Create a constant pool entry for the callee address 2645 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2646 ARMConstantPoolValue *CPV = 2647 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2648 ARMPCLabelIndex, 0); 2649 // Get the address of the callee into a register 2650 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2651 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2652 Callee = DAG.getLoad( 2653 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2654 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2655 } 2656 } else if (isa<GlobalAddressSDNode>(Callee)) { 2657 if (!PreferIndirect) { 2658 isDirect = true; 2659 bool isDef = GV->isStrongDefinitionForLinker(); 2660 2661 // ARM call to a local ARM function is predicable. 2662 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2663 // tBX takes a register source operand. 
2664 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2665 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2666 Callee = DAG.getNode( 2667 ARMISD::WrapperPIC, dl, PtrVt, 2668 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2669 Callee = DAG.getLoad( 2670 PtrVt, dl, DAG.getEntryNode(), Callee, 2671 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), 2672 MachineMemOperand::MODereferenceable | 2673 MachineMemOperand::MOInvariant); 2674 } else if (Subtarget->isTargetCOFF()) { 2675 assert(Subtarget->isTargetWindows() && 2676 "Windows is the only supported COFF target"); 2677 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2678 if (GV->hasDLLImportStorageClass()) 2679 TargetFlags = ARMII::MO_DLLIMPORT; 2680 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2681 TargetFlags = ARMII::MO_COFFSTUB; 2682 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2683 TargetFlags); 2684 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2685 Callee = 2686 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2687 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2688 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2689 } else { 2690 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2691 } 2692 } 2693 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2694 isDirect = true; 2695 // tBX takes a register source operand. 2696 const char *Sym = S->getSymbol(); 2697 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2698 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2699 ARMConstantPoolValue *CPV = 2700 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2701 ARMPCLabelIndex, 4); 2702 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2703 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2704 Callee = DAG.getLoad( 2705 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2706 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2707 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2708 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2709 } else { 2710 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2711 } 2712 } 2713 2714 if (isCmseNSCall) { 2715 assert(!isARMFunc && !isDirect && 2716 "Cannot handle call to ARM function or direct call"); 2717 if (NumBytes > 0) { 2718 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), 2719 "call to non-secure function would " 2720 "require passing arguments on stack", 2721 dl.getDebugLoc()); 2722 DAG.getContext()->diagnose(Diag); 2723 } 2724 if (isStructRet) { 2725 DiagnosticInfoUnsupported Diag( 2726 DAG.getMachineFunction().getFunction(), 2727 "call to non-secure function would return value through pointer", 2728 dl.getDebugLoc()); 2729 DAG.getContext()->diagnose(Diag); 2730 } 2731 } 2732 2733 // FIXME: handle tail calls differently. 
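// The opcode chosen below separates ordinary linking calls (ARMISD::CALL,
// CALL_PRED and the BTI/CMSE variants) from calls that cannot use a BL/BLX to
// set LR (ARMISD::CALL_NOLINK), e.g. indirect calls on pre-v5T targets where
// the return address has to be set up separately.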
2734 unsigned CallOpc; 2735 if (Subtarget->isThumb()) { 2736 if (GuardWithBTI) 2737 CallOpc = ARMISD::t2CALL_BTI; 2738 else if (isCmseNSCall) 2739 CallOpc = ARMISD::tSECALL; 2740 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2741 CallOpc = ARMISD::CALL_NOLINK; 2742 else 2743 CallOpc = ARMISD::CALL; 2744 } else { 2745 if (!isDirect && !Subtarget->hasV5TOps()) 2746 CallOpc = ARMISD::CALL_NOLINK; 2747 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2748 // Emit regular call when code size is the priority 2749 !Subtarget->hasMinSize()) 2750 // "mov lr, pc; b _foo" to avoid confusing the RSP 2751 CallOpc = ARMISD::CALL_NOLINK; 2752 else 2753 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2754 } 2755 2756 // We don't usually want to end the call-sequence here because we would tidy 2757 // the frame up *after* the call, however in the ABI-changing tail-call case 2758 // we've carefully laid out the parameters so that when sp is reset they'll be 2759 // in the correct location. 2760 if (isTailCall && !isSibCall) { 2761 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 2762 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2763 InFlag = Chain.getValue(1); 2764 } 2765 2766 std::vector<SDValue> Ops; 2767 Ops.push_back(Chain); 2768 Ops.push_back(Callee); 2769 2770 if (isTailCall) { 2771 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); 2772 } 2773 2774 // Add argument registers to the end of the list so that they are known live 2775 // into the call. 2776 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2777 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2778 RegsToPass[i].second.getValueType())); 2779 2780 // Add a register mask operand representing the call-preserved registers. 2781 if (!isTailCall) { 2782 const uint32_t *Mask; 2783 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2784 if (isThisReturn) { 2785 // For 'this' returns, use the R0-preserving mask if applicable 2786 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2787 if (!Mask) { 2788 // Set isThisReturn to false if the calling convention is not one that 2789 // allows 'returned' to be modeled in this way, so LowerCallResult does 2790 // not try to pass 'this' straight through 2791 isThisReturn = false; 2792 Mask = ARI->getCallPreservedMask(MF, CallConv); 2793 } 2794 } else 2795 Mask = ARI->getCallPreservedMask(MF, CallConv); 2796 2797 assert(Mask && "Missing call preserved mask for calling convention"); 2798 Ops.push_back(DAG.getRegisterMask(Mask)); 2799 } 2800 2801 if (InFlag.getNode()) 2802 Ops.push_back(InFlag); 2803 2804 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2805 if (isTailCall) { 2806 MF.getFrameInfo().setHasTailCall(); 2807 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2808 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2809 return Ret; 2810 } 2811 2812 // Returns a chain and a flag for retval copy to use. 2813 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2814 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2815 InFlag = Chain.getValue(1); 2816 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2817 2818 // If we're guaranteeing tail-calls will be honoured, the callee must 2819 // pop its own argument stack on return. But this call is *not* a tail call so 2820 // we need to undo that after it returns to restore the status-quo. 
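// A worked example of the clean-up below: when canGuaranteeTCO() accepts the
// convention under GuaranteedTailCallOpt, the callee pops its own argument
// area, so CalleePopBytes is NumBytes rounded up to 16; otherwise it is -1
// and the caller restores the stack itself.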
2821 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2822 uint64_t CalleePopBytes =
2823 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2824
2825 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2826 DAG.getIntPtrConstant(CalleePopBytes, dl, true),
2827 InFlag, dl);
2828 if (!Ins.empty())
2829 InFlag = Chain.getValue(1);
2830
2831 // Handle result values, copying them out of physregs into vregs that we
2832 // return.
2833 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2834 InVals, isThisReturn,
2835 isThisReturn ? OutVals[0] : SDValue());
2836 }
2837
2838 /// HandleByVal - Every parameter *after* a byval parameter is passed
2839 /// on the stack. Remember the next parameter register to allocate,
2840 /// and then confiscate the rest of the parameter registers to ensure
2841 /// this.
2842 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2843 Align Alignment) const {
2844 // Byval (as with any stack) slots are always at least 4 byte aligned.
2845 Alignment = std::max(Alignment, Align(4));
2846
2847 unsigned Reg = State->AllocateReg(GPRArgRegs);
2848 if (!Reg)
2849 return;
2850
2851 unsigned AlignInRegs = Alignment.value() / 4;
2852 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2853 for (unsigned i = 0; i < Waste; ++i)
2854 Reg = State->AllocateReg(GPRArgRegs);
2855
2856 if (!Reg)
2857 return;
2858
2859 unsigned Excess = 4 * (ARM::R4 - Reg);
2860
2861 // Special case when NSAA != SP and the parameter size is greater than the
2862 // size of all remaining GPR regs. In that case we cannot split the
2863 // parameter and must send it entirely to the stack. We also must set NCRN
2864 // to R4, so we waste all remaining registers.
2865 const unsigned NSAAOffset = State->getNextStackOffset();
2866 if (NSAAOffset != 0 && Size > Excess) {
2867 while (State->AllocateReg(GPRArgRegs))
2868 ;
2869 return;
2870 }
2871
2872 // The first register for the byval parameter is the first register that
2873 // wasn't allocated before this method call, so it is "Reg".
2874 // If the parameter is small enough to be saved in the range [reg, r4), the
2875 // end (one past the last) register is reg + param-size-in-regs; otherwise
2876 // the parameter is split between registers and the stack, and the end
2877 // register is r4 in that case.
2878 unsigned ByValRegBegin = Reg;
2879 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2880 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2881 // Note that the first register was already allocated at the beginning of
2882 // the function; allocate the remaining registers we need here.
2883 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2884 State->AllocateReg(GPRArgRegs);
2885 // A byval parameter that is split between registers and memory needs its
2886 // size truncated here.
2887 // In the case where the entire structure fits in registers, we set the
2888 // size in memory to zero.
2889 Size = std::max<int>(Size - Excess, 0);
2890 }
2891
2892 /// MatchingStackOffset - Return true if the given stack call argument is
2893 /// already available in the same position (relatively) of the caller's
2894 /// incoming argument stack.
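/// For example, an outgoing tail-call argument that is simply a reload of the
/// caller's own incoming argument from the fixed stack slot at the same
/// offset, with the same size, already occupies the right location and needs
/// no copy.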
2895 static 2896 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2897 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2898 const TargetInstrInfo *TII) { 2899 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2900 int FI = std::numeric_limits<int>::max(); 2901 if (Arg.getOpcode() == ISD::CopyFromReg) { 2902 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2903 if (!Register::isVirtualRegister(VR)) 2904 return false; 2905 MachineInstr *Def = MRI->getVRegDef(VR); 2906 if (!Def) 2907 return false; 2908 if (!Flags.isByVal()) { 2909 if (!TII->isLoadFromStackSlot(*Def, FI)) 2910 return false; 2911 } else { 2912 return false; 2913 } 2914 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2915 if (Flags.isByVal()) 2916 // ByVal argument is passed in as a pointer but it's now being 2917 // dereferenced. e.g. 2918 // define @foo(%struct.X* %A) { 2919 // tail call @bar(%struct.X* byval %A) 2920 // } 2921 return false; 2922 SDValue Ptr = Ld->getBasePtr(); 2923 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2924 if (!FINode) 2925 return false; 2926 FI = FINode->getIndex(); 2927 } else 2928 return false; 2929 2930 assert(FI != std::numeric_limits<int>::max()); 2931 if (!MFI.isFixedObjectIndex(FI)) 2932 return false; 2933 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2934 } 2935 2936 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2937 /// for tail call optimization. Targets which want to do tail call 2938 /// optimization should implement this function. 2939 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2940 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2941 bool isCalleeStructRet, bool isCallerStructRet, 2942 const SmallVectorImpl<ISD::OutputArg> &Outs, 2943 const SmallVectorImpl<SDValue> &OutVals, 2944 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2945 const bool isIndirect) const { 2946 MachineFunction &MF = DAG.getMachineFunction(); 2947 const Function &CallerF = MF.getFunction(); 2948 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2949 2950 assert(Subtarget->supportsTailCall()); 2951 2952 // Indirect tail calls cannot be optimized for Thumb1 if the args 2953 // to the call take up r0-r3. The reason is that there are no legal registers 2954 // left to hold the pointer to the function to be called. 2955 // Similarly, if the function uses return address sign and authentication, 2956 // r12 is needed to hold the PAC and is not available to hold the callee 2957 // address. 2958 if (Outs.size() >= 4 && 2959 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { 2960 if (Subtarget->isThumb1Only()) 2961 return false; 2962 // Conservatively assume the function spills LR. 2963 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) 2964 return false; 2965 } 2966 2967 // Look for obvious safe cases to perform tail call optimization that do not 2968 // require ABI changes. This is what gcc calls sibcall. 2969 2970 // Exception-handling functions need a special set of instructions to indicate 2971 // a return to the hardware. Tail-calling another function would probably 2972 // break this. 2973 if (CallerF.hasFnAttribute("interrupt")) 2974 return false; 2975 2976 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) 2977 return CalleeCC == CallerCC; 2978 2979 // Also avoid sibcall optimization if either caller or callee uses struct 2980 // return semantics. 
2981 if (isCalleeStructRet || isCallerStructRet) 2982 return false; 2983 2984 // Externally-defined functions with weak linkage should not be 2985 // tail-called on ARM when the OS does not support dynamic 2986 // pre-emption of symbols, as the AAELF spec requires normal calls 2987 // to undefined weak functions to be replaced with a NOP or jump to the 2988 // next instruction. The behaviour of branch instructions in this 2989 // situation (as used for tail calls) is implementation-defined, so we 2990 // cannot rely on the linker replacing the tail call with a return. 2991 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2992 const GlobalValue *GV = G->getGlobal(); 2993 const Triple &TT = getTargetMachine().getTargetTriple(); 2994 if (GV->hasExternalWeakLinkage() && 2995 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2996 return false; 2997 } 2998 2999 // Check that the call results are passed in the same way. 3000 LLVMContext &C = *DAG.getContext(); 3001 if (!CCState::resultsCompatible( 3002 getEffectiveCallingConv(CalleeCC, isVarArg), 3003 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, 3004 CCAssignFnForReturn(CalleeCC, isVarArg), 3005 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) 3006 return false; 3007 // The callee has to preserve all registers the caller needs to preserve. 3008 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3009 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3010 if (CalleeCC != CallerCC) { 3011 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3012 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3013 return false; 3014 } 3015 3016 // If Caller's vararg or byval argument has been split between registers and 3017 // stack, do not perform tail call, since part of the argument is in caller's 3018 // local frame. 3019 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 3020 if (AFI_Caller->getArgRegsSaveSize()) 3021 return false; 3022 3023 // If the callee takes no arguments then go on to check the results of the 3024 // call. 3025 if (!Outs.empty()) { 3026 // Check if stack adjustment is needed. For now, do not do this if any 3027 // argument is passed on the stack. 3028 SmallVector<CCValAssign, 16> ArgLocs; 3029 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3030 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 3031 if (CCInfo.getNextStackOffset()) { 3032 // Check if the arguments are already laid out in the right way as 3033 // the caller's fixed stack objects. 3034 MachineFrameInfo &MFI = MF.getFrameInfo(); 3035 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3036 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 3037 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 3038 i != e; 3039 ++i, ++realArgIdx) { 3040 CCValAssign &VA = ArgLocs[i]; 3041 EVT RegVT = VA.getLocVT(); 3042 SDValue Arg = OutVals[realArgIdx]; 3043 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 3044 if (VA.getLocInfo() == CCValAssign::Indirect) 3045 return false; 3046 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { 3047 // f64 and vector types are split into multiple registers or 3048 // register/stack-slot combinations. The types will not match 3049 // the registers; give up on memory f64 refs until we figure 3050 // out what to do about this. 
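          // For example, under a soft-float calling convention an f64 is
          // assigned two GPR locations (or a GPR plus a stack slot), and a
          // v2f64 expands to four such locations, which is why multiple
          // ArgLocs entries are consumed per value below.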
3051 if (!VA.isRegLoc()) 3052 return false; 3053 if (!ArgLocs[++i].isRegLoc()) 3054 return false; 3055 if (RegVT == MVT::v2f64) { 3056 if (!ArgLocs[++i].isRegLoc()) 3057 return false; 3058 if (!ArgLocs[++i].isRegLoc()) 3059 return false; 3060 } 3061 } else if (!VA.isRegLoc()) { 3062 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3063 MFI, MRI, TII)) 3064 return false; 3065 } 3066 } 3067 } 3068 3069 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3070 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 3071 return false; 3072 } 3073 3074 return true; 3075 } 3076 3077 bool 3078 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 3079 MachineFunction &MF, bool isVarArg, 3080 const SmallVectorImpl<ISD::OutputArg> &Outs, 3081 LLVMContext &Context) const { 3082 SmallVector<CCValAssign, 16> RVLocs; 3083 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3084 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3085 } 3086 3087 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 3088 const SDLoc &DL, SelectionDAG &DAG) { 3089 const MachineFunction &MF = DAG.getMachineFunction(); 3090 const Function &F = MF.getFunction(); 3091 3092 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 3093 3094 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 3095 // version of the "preferred return address". These offsets affect the return 3096 // instruction if this is a return from PL1 without hypervisor extensions. 3097 // IRQ/FIQ: +4 "subs pc, lr, #4" 3098 // SWI: 0 "subs pc, lr, #0" 3099 // ABORT: +4 "subs pc, lr, #4" 3100 // UNDEF: +4/+2 "subs pc, lr, #0" 3101 // UNDEF varies depending on where the exception came from ARM or Thumb 3102 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 3103 3104 int64_t LROffset; 3105 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 3106 IntKind == "ABORT") 3107 LROffset = 4; 3108 else if (IntKind == "SWI" || IntKind == "UNDEF") 3109 LROffset = 0; 3110 else 3111 report_fatal_error("Unsupported interrupt attribute. If present, value " 3112 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 3113 3114 RetOps.insert(RetOps.begin() + 1, 3115 DAG.getConstant(LROffset, DL, MVT::i32, false)); 3116 3117 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 3118 } 3119 3120 SDValue 3121 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3122 bool isVarArg, 3123 const SmallVectorImpl<ISD::OutputArg> &Outs, 3124 const SmallVectorImpl<SDValue> &OutVals, 3125 const SDLoc &dl, SelectionDAG &DAG) const { 3126 // CCValAssign - represent the assignment of the return value to a location. 3127 SmallVector<CCValAssign, 16> RVLocs; 3128 3129 // CCState - Info about the registers and stack slots. 3130 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3131 *DAG.getContext()); 3132 3133 // Analyze outgoing return values. 3134 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3135 3136 SDValue Flag; 3137 SmallVector<SDValue, 4> RetOps; 3138 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3139 bool isLittleEndian = Subtarget->isLittle(); 3140 3141 MachineFunction &MF = DAG.getMachineFunction(); 3142 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3143 AFI->setReturnRegsCount(RVLocs.size()); 3144 3145 // Report error if cmse entry function returns structure through first ptr arg. 
3146 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { 3147 // Note: using an empty SDLoc(), as the first line of the function is a 3148 // better place to report than the last line. 3149 DiagnosticInfoUnsupported Diag( 3150 DAG.getMachineFunction().getFunction(), 3151 "secure entry function would return value through pointer", 3152 SDLoc().getDebugLoc()); 3153 DAG.getContext()->diagnose(Diag); 3154 } 3155 3156 // Copy the result values into the output registers. 3157 for (unsigned i = 0, realRVLocIdx = 0; 3158 i != RVLocs.size(); 3159 ++i, ++realRVLocIdx) { 3160 CCValAssign &VA = RVLocs[i]; 3161 assert(VA.isRegLoc() && "Can only return in registers!"); 3162 3163 SDValue Arg = OutVals[realRVLocIdx]; 3164 bool ReturnF16 = false; 3165 3166 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 3167 // Half-precision return values can be returned like this: 3168 // 3169 // t11 f16 = fadd ... 3170 // t12: i16 = bitcast t11 3171 // t13: i32 = zero_extend t12 3172 // t14: f32 = bitcast t13 <~~~~~~~ Arg 3173 // 3174 // to avoid code generation for bitcasts, we simply set Arg to the node 3175 // that produces the f16 value, t11 in this case. 3176 // 3177 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 3178 SDValue ZE = Arg.getOperand(0); 3179 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 3180 SDValue BC = ZE.getOperand(0); 3181 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 3182 Arg = BC.getOperand(0); 3183 ReturnF16 = true; 3184 } 3185 } 3186 } 3187 } 3188 3189 switch (VA.getLocInfo()) { 3190 default: llvm_unreachable("Unknown loc info!"); 3191 case CCValAssign::Full: break; 3192 case CCValAssign::BCvt: 3193 if (!ReturnF16) 3194 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3195 break; 3196 } 3197 3198 // Mask f16 arguments if this is a CMSE nonsecure entry. 3199 auto RetVT = Outs[realRVLocIdx].ArgVT; 3200 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { 3201 if (VA.needsCustom() && VA.getValVT() == MVT::f16) { 3202 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 3203 } else { 3204 auto LocBits = VA.getLocVT().getSizeInBits(); 3205 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); 3206 SDValue Mask = 3207 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 3208 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 3209 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 3210 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3211 } 3212 } 3213 3214 if (VA.needsCustom() && 3215 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { 3216 if (VA.getLocVT() == MVT::v2f64) { 3217 // Extract the first half and return it in two registers. 3218 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3219 DAG.getConstant(0, dl, MVT::i32)); 3220 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 3221 DAG.getVTList(MVT::i32, MVT::i32), Half); 3222 3223 Chain = 3224 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3225 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); 3226 Flag = Chain.getValue(1); 3227 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3228 VA = RVLocs[++i]; // skip ahead to next loc 3229 Chain = 3230 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3231 HalfGPRs.getValue(isLittleEndian ? 
1 : 0), Flag); 3232 Flag = Chain.getValue(1); 3233 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3234 VA = RVLocs[++i]; // skip ahead to next loc 3235 3236 // Extract the 2nd half and fall through to handle it as an f64 value. 3237 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3238 DAG.getConstant(1, dl, MVT::i32)); 3239 } 3240 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 3241 // available. 3242 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 3243 DAG.getVTList(MVT::i32, MVT::i32), Arg); 3244 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3245 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); 3246 Flag = Chain.getValue(1); 3247 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3248 VA = RVLocs[++i]; // skip ahead to next loc 3249 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3250 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); 3251 } else 3252 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 3253 3254 // Guarantee that all emitted copies are 3255 // stuck together, avoiding something bad. 3256 Flag = Chain.getValue(1); 3257 RetOps.push_back(DAG.getRegister( 3258 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); 3259 } 3260 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3261 const MCPhysReg *I = 3262 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3263 if (I) { 3264 for (; *I; ++I) { 3265 if (ARM::GPRRegClass.contains(*I)) 3266 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3267 else if (ARM::DPRRegClass.contains(*I)) 3268 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3269 else 3270 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3271 } 3272 } 3273 3274 // Update chain and glue. 3275 RetOps[0] = Chain; 3276 if (Flag.getNode()) 3277 RetOps.push_back(Flag); 3278 3279 // CPUs which aren't M-class use a special sequence to return from 3280 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 3281 // though we use "subs pc, lr, #N"). 3282 // 3283 // M-class CPUs actually use a normal return sequence with a special 3284 // (hardware-provided) value in LR, so the normal code path works. 3285 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 3286 !Subtarget->isMClass()) { 3287 if (Subtarget->isThumb1Only()) 3288 report_fatal_error("interrupt attribute is not supported in Thumb1"); 3289 return LowerInterruptReturn(RetOps, dl, DAG); 3290 } 3291 3292 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : 3293 ARMISD::RET_FLAG; 3294 return DAG.getNode(RetNode, dl, MVT::Other, RetOps); 3295 } 3296 3297 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 3298 if (N->getNumValues() != 1) 3299 return false; 3300 if (!N->hasNUsesOfValue(1, 0)) 3301 return false; 3302 3303 SDValue TCChain = Chain; 3304 SDNode *Copy = *N->use_begin(); 3305 if (Copy->getOpcode() == ISD::CopyToReg) { 3306 // If the copy has a glue operand, we conservatively assume it isn't safe to 3307 // perform a tail call. 3308 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3309 return false; 3310 TCChain = Copy->getOperand(0); 3311 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 3312 SDNode *VMov = Copy; 3313 // f64 returned in a pair of GPRs. 
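    // An illustrative shape of the DAG being matched here (soft-float f64
    // return):
    //
    //   t0: i32,i32 = ARMISD::VMOVRRD tArg
    //   t1: ch = CopyToReg Chain, $r0, t0:0
    //   t2: ch = CopyToReg t1, $r1, t0:1
    //   t3: ch = ARMISD::RET_FLAG t2, $r0, $r1, ...
    //
    // Both CopyToReg users of the VMOVRRD must ultimately feed the return.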
3314 SmallPtrSet<SDNode*, 2> Copies; 3315 for (SDNode *U : VMov->uses()) { 3316 if (U->getOpcode() != ISD::CopyToReg) 3317 return false; 3318 Copies.insert(U); 3319 } 3320 if (Copies.size() > 2) 3321 return false; 3322 3323 for (SDNode *U : VMov->uses()) { 3324 SDValue UseChain = U->getOperand(0); 3325 if (Copies.count(UseChain.getNode())) 3326 // Second CopyToReg 3327 Copy = U; 3328 else { 3329 // We are at the top of this chain. 3330 // If the copy has a glue operand, we conservatively assume it 3331 // isn't safe to perform a tail call. 3332 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue) 3333 return false; 3334 // First CopyToReg 3335 TCChain = UseChain; 3336 } 3337 } 3338 } else if (Copy->getOpcode() == ISD::BITCAST) { 3339 // f32 returned in a single GPR. 3340 if (!Copy->hasOneUse()) 3341 return false; 3342 Copy = *Copy->use_begin(); 3343 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 3344 return false; 3345 // If the copy has a glue operand, we conservatively assume it isn't safe to 3346 // perform a tail call. 3347 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3348 return false; 3349 TCChain = Copy->getOperand(0); 3350 } else { 3351 return false; 3352 } 3353 3354 bool HasRet = false; 3355 for (const SDNode *U : Copy->uses()) { 3356 if (U->getOpcode() != ARMISD::RET_FLAG && 3357 U->getOpcode() != ARMISD::INTRET_FLAG) 3358 return false; 3359 HasRet = true; 3360 } 3361 3362 if (!HasRet) 3363 return false; 3364 3365 Chain = TCChain; 3366 return true; 3367 } 3368 3369 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3370 if (!Subtarget->supportsTailCall()) 3371 return false; 3372 3373 if (!CI->isTailCall()) 3374 return false; 3375 3376 return true; 3377 } 3378 3379 // Trying to write a 64 bit value so need to split into two 32 bit values first, 3380 // and pass the lower and high parts through. 3381 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 3382 SDLoc DL(Op); 3383 SDValue WriteValue = Op->getOperand(2); 3384 3385 // This function is only supposed to be called for i64 type argument. 3386 assert(WriteValue.getValueType() == MVT::i64 3387 && "LowerWRITE_REGISTER called for non-i64 type argument."); 3388 3389 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3390 DAG.getConstant(0, DL, MVT::i32)); 3391 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3392 DAG.getConstant(1, DL, MVT::i32)); 3393 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 3394 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 3395 } 3396 3397 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 3398 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 3399 // one of the above mentioned nodes. It has to be wrapped because otherwise 3400 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 3401 // be used to form addressing mode. These wrapped nodes will be selected 3402 // into MOVi. 3403 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3404 SelectionDAG &DAG) const { 3405 EVT PtrVT = Op.getValueType(); 3406 // FIXME there is no actual debug info here 3407 SDLoc dl(Op); 3408 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3409 SDValue Res; 3410 3411 // When generating execute-only code Constant Pools must be promoted to the 3412 // global data section. 
It's a bit ugly that we can't share them across basic 3413 // blocks, but this way we guarantee that execute-only behaves correct with 3414 // position-independent addressing modes. 3415 if (Subtarget->genExecuteOnly()) { 3416 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3417 auto T = const_cast<Type*>(CP->getType()); 3418 auto C = const_cast<Constant*>(CP->getConstVal()); 3419 auto M = const_cast<Module*>(DAG.getMachineFunction(). 3420 getFunction().getParent()); 3421 auto GV = new GlobalVariable( 3422 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3423 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3424 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3425 Twine(AFI->createPICLabelUId()) 3426 ); 3427 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3428 dl, PtrVT); 3429 return LowerGlobalAddress(GA, DAG); 3430 } 3431 3432 if (CP->isMachineConstantPoolEntry()) 3433 Res = 3434 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); 3435 else 3436 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); 3437 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3438 } 3439 3440 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3441 return MachineJumpTableInfo::EK_Inline; 3442 } 3443 3444 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3445 SelectionDAG &DAG) const { 3446 MachineFunction &MF = DAG.getMachineFunction(); 3447 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3448 unsigned ARMPCLabelIndex = 0; 3449 SDLoc DL(Op); 3450 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3451 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3452 SDValue CPAddr; 3453 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3454 if (!IsPositionIndependent) { 3455 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); 3456 } else { 3457 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 3458 ARMPCLabelIndex = AFI->createPICLabelUId(); 3459 ARMConstantPoolValue *CPV = 3460 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3461 ARMCP::CPBlockAddress, PCAdj); 3462 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3463 } 3464 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3465 SDValue Result = DAG.getLoad( 3466 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3467 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3468 if (!IsPositionIndependent) 3469 return Result; 3470 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3471 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3472 } 3473 3474 /// Convert a TLS address reference into the correct sequence of loads 3475 /// and calls to compute the variable's address for Darwin, and return an 3476 /// SDValue containing the final node. 3477 3478 /// Darwin only has one TLS scheme which must be capable of dealing with the 3479 /// fully general situation, in the worst case. This means: 3480 /// + "extern __thread" declaration. 3481 /// + Defined in a possibly unknown dynamic library. 3482 /// 3483 /// The general system is that each __thread variable has a [3 x i32] descriptor 3484 /// which contains information used by the runtime to calculate the address. The 3485 /// only part of this the compiler needs to know about is the first word, which 3486 /// contains a function pointer that must be called with the address of the 3487 /// entire descriptor in "r0". 
3488 ///
3489 /// Since this descriptor may be in a different unit, in general access must
3490 /// proceed along the usual ARM rules. A common sequence to produce is:
3491 ///
3492 /// movw rT1, :lower16:_var$non_lazy_ptr
3493 /// movt rT1, :upper16:_var$non_lazy_ptr
3494 /// ldr r0, [rT1]
3495 /// ldr rT2, [r0]
3496 /// blx rT2
3497 /// [...address now in r0...]
3498 SDValue
3499 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3500 SelectionDAG &DAG) const {
3501 assert(Subtarget->isTargetDarwin() &&
3502 "This function expects a Darwin target");
3503 SDLoc DL(Op);
3504
3505 // The first step is to get the address of the actual global symbol. This is
3506 // where the TLS descriptor lives.
3507 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3508
3509 // The first entry in the descriptor is a function pointer that we must call
3510 // to obtain the address of the variable.
3511 SDValue Chain = DAG.getEntryNode();
3512 SDValue FuncTLVGet = DAG.getLoad(
3513 MVT::i32, DL, Chain, DescAddr,
3514 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3515 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3516 MachineMemOperand::MOInvariant);
3517 Chain = FuncTLVGet.getValue(1);
3518
3519 MachineFunction &F = DAG.getMachineFunction();
3520 MachineFrameInfo &MFI = F.getFrameInfo();
3521 MFI.setAdjustsStack(true);
3522
3523 // TLS calls preserve all registers except those that absolutely must be
3524 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3525 // silly).
3526 auto TRI =
3527 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3528 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3529 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3530
3531 // Finally, we can make the call. This is just a degenerate version of a
3532 // normal ARM call node: r0 takes the address of the descriptor, and the call
3533 // returns the address of the variable in this thread.
3534 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3535 Chain =
3536 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3537 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3538 DAG.getRegisterMask(Mask), Chain.getValue(1));
3539 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3540 }
3541
3542 SDValue
3543 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3544 SelectionDAG &DAG) const {
3545 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3546
3547 SDValue Chain = DAG.getEntryNode();
3548 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3549 SDLoc DL(Op);
3550
3551 // Load the current TEB (thread environment block)
3552 SDValue Ops[] = {Chain,
3553 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3554 DAG.getTargetConstant(15, DL, MVT::i32),
3555 DAG.getTargetConstant(0, DL, MVT::i32),
3556 DAG.getTargetConstant(13, DL, MVT::i32),
3557 DAG.getTargetConstant(0, DL, MVT::i32),
3558 DAG.getTargetConstant(2, DL, MVT::i32)};
3559 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3560 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3561
3562 SDValue TEB = CurrentTEB.getValue(0);
3563 Chain = CurrentTEB.getValue(1);
3564
3565 // Load the ThreadLocalStoragePointer from the TEB
3566 // A pointer to the TLS array is located at offset 0x2c from the TEB.
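  // Putting the pieces below together, the access sequence is roughly:
  //
  //   mrc p15, 0, rT, c13, c0, 2   ; current TEB (TPIDRURW)
  //   ldr rA, [rT, #0x2c]          ; TEB->ThreadLocalStoragePointer
  //   movw/movt rI, _tls_index     ; &_tls_index, then its value
  //   ldr rI, [rI]
  //   ldr rB, [rA, rI, lsl #2]     ; this module's TLS block
  //   ldr rO, <constpool: SECREL of the variable>
  //   add r0, rB, rO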
3567 SDValue TLSArray = 3568 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3569 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3570 3571 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3572 // offset into the TLSArray. 3573 3574 // Load the TLS index from the C runtime 3575 SDValue TLSIndex = 3576 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3577 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3578 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3579 3580 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3581 DAG.getConstant(2, DL, MVT::i32)); 3582 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3583 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3584 MachinePointerInfo()); 3585 3586 // Get the offset of the start of the .tls section (section base) 3587 const auto *GA = cast<GlobalAddressSDNode>(Op); 3588 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3589 SDValue Offset = DAG.getLoad( 3590 PtrVT, DL, Chain, 3591 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3592 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), 3593 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3594 3595 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3596 } 3597 3598 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3599 SDValue 3600 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3601 SelectionDAG &DAG) const { 3602 SDLoc dl(GA); 3603 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3604 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3605 MachineFunction &MF = DAG.getMachineFunction(); 3606 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3607 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3608 ARMConstantPoolValue *CPV = 3609 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3610 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3611 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3612 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3613 Argument = DAG.getLoad( 3614 PtrVT, dl, DAG.getEntryNode(), Argument, 3615 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3616 SDValue Chain = Argument.getValue(1); 3617 3618 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3619 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3620 3621 // call __tls_get_addr. 3622 ArgListTy Args; 3623 ArgListEntry Entry; 3624 Entry.Node = Argument; 3625 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3626 Args.push_back(Entry); 3627 3628 // FIXME: is there useful debug info available here? 3629 TargetLowering::CallLoweringInfo CLI(DAG); 3630 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3631 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3632 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3633 3634 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3635 return CallResult.first; 3636 } 3637 3638 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3639 // "local exec" model. 
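// Both models compute the address as thread-pointer + offset; they differ in
// where the offset comes from. Initial exec loads a PIC literal
// (ARMCP::GOTTPOFF) that addresses a GOT slot holding the variable's offset
// from the thread pointer, requiring one extra load through the GOT. Local
// exec loads the offset itself (ARMCP::TPOFF) directly from the constant pool.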
3640 SDValue 3641 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3642 SelectionDAG &DAG, 3643 TLSModel::Model model) const { 3644 const GlobalValue *GV = GA->getGlobal(); 3645 SDLoc dl(GA); 3646 SDValue Offset; 3647 SDValue Chain = DAG.getEntryNode(); 3648 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3649 // Get the Thread Pointer 3650 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3651 3652 if (model == TLSModel::InitialExec) { 3653 MachineFunction &MF = DAG.getMachineFunction(); 3654 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3655 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3656 // Initial exec model. 3657 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3658 ARMConstantPoolValue *CPV = 3659 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3660 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3661 true); 3662 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3663 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3664 Offset = DAG.getLoad( 3665 PtrVT, dl, Chain, Offset, 3666 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3667 Chain = Offset.getValue(1); 3668 3669 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3670 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3671 3672 Offset = DAG.getLoad( 3673 PtrVT, dl, Chain, Offset, 3674 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3675 } else { 3676 // local exec model 3677 assert(model == TLSModel::LocalExec); 3678 ARMConstantPoolValue *CPV = 3679 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3680 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3681 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3682 Offset = DAG.getLoad( 3683 PtrVT, dl, Chain, Offset, 3684 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3685 } 3686 3687 // The address of the thread local variable is the add of the thread 3688 // pointer with the offset of the variable. 3689 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3690 } 3691 3692 SDValue 3693 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3694 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3695 if (DAG.getTarget().useEmulatedTLS()) 3696 return LowerToTLSEmulatedModel(GA, DAG); 3697 3698 if (Subtarget->isTargetDarwin()) 3699 return LowerGlobalTLSAddressDarwin(Op, DAG); 3700 3701 if (Subtarget->isTargetWindows()) 3702 return LowerGlobalTLSAddressWindows(Op, DAG); 3703 3704 // TODO: implement the "local dynamic" model 3705 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3706 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3707 3708 switch (model) { 3709 case TLSModel::GeneralDynamic: 3710 case TLSModel::LocalDynamic: 3711 return LowerToTLSGeneralDynamicModel(GA, DAG); 3712 case TLSModel::InitialExec: 3713 case TLSModel::LocalExec: 3714 return LowerToTLSExecModels(GA, DAG, model); 3715 } 3716 llvm_unreachable("bogus TLS model"); 3717 } 3718 3719 /// Return true if all users of V are within function F, looking through 3720 /// ConstantExprs. 
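/// For example, a global referenced only through a constant GEP expression
/// whose uses are all instructions inside F counts as local to F, while a use
/// from another function or from a global initializer makes this return false.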
3721 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3722 SmallVector<const User*,4> Worklist(V->users());
3723 while (!Worklist.empty()) {
3724 auto *U = Worklist.pop_back_val();
3725 if (isa<ConstantExpr>(U)) {
3726 append_range(Worklist, U->users());
3727 continue;
3728 }
3729
3730 auto *I = dyn_cast<Instruction>(U);
3731 if (!I || I->getParent()->getParent() != F)
3732 return false;
3733 }
3734 return true;
3735 }
3736
3737 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3738 const GlobalValue *GV, SelectionDAG &DAG,
3739 EVT PtrVT, const SDLoc &dl) {
3740 // If we're creating a pool entry for a constant global with unnamed address,
3741 // and the global is small enough, we can emit it inline into the constant pool
3742 // to save ourselves an indirection.
3743 //
3744 // This is a win if the constant is only used in one function (so it doesn't
3745 // need to be duplicated) or duplicating the constant wouldn't increase code
3746 // size (implying the constant is no larger than 4 bytes).
3747 const Function &F = DAG.getMachineFunction().getFunction();
3748
3749 // We rely on this decision to inline being idempotent and unrelated to the
3750 // use-site. We know that if we inline a variable at one use site, we'll
3751 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3752 // doesn't know about this optimization, so bail out if it's enabled;
3753 // otherwise we could decide to inline here (and thus never emit the GV)
3754 // while fast-isel-generated code still requires the GV.
3755 if (!EnableConstpoolPromotion ||
3756 DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3757 return SDValue();
3758
3759 auto *GVar = dyn_cast<GlobalVariable>(GV);
3760 if (!GVar || !GVar->hasInitializer() ||
3761 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3762 !GVar->hasLocalLinkage())
3763 return SDValue();
3764
3765 // If we inline a value that contains relocations, we move the relocations
3766 // from .data to .text. This is not allowed in position-independent code.
3767 auto *Init = GVar->getInitializer();
3768 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3769 Init->needsDynamicRelocation())
3770 return SDValue();
3771
3772 // The constant islands pass can only really deal with alignment requests
3773 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3774 // any type wanting greater alignment requirements than 4 bytes. We also
3775 // can only promote constants that are multiples of 4 bytes in size or
3776 // are paddable to a multiple of 4. Currently we only try to pad constants
3777 // that are strings for simplicity.
3778 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3779 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3780 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3781 unsigned RequiredPadding = 4 - (Size % 4);
3782 bool PaddingPossible =
3783 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3784 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3785 Size == 0)
3786 return SDValue();
3787
3788 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3789 MachineFunction &MF = DAG.getMachineFunction();
3790 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3791
3792 // We can't bloat the constant pool too much, else the ConstantIslands pass
3793 // may fail to converge.
If we haven't promoted this global yet (it may have 3794 // multiple uses), and promoting it would increase the constant pool size (Sz 3795 // > 4), ensure we have space to do so up to MaxTotal. 3796 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3797 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3798 ConstpoolPromotionMaxTotal) 3799 return SDValue(); 3800 3801 // This is only valid if all users are in a single function; we can't clone 3802 // the constant in general. The LLVM IR unnamed_addr allows merging 3803 // constants, but not cloning them. 3804 // 3805 // We could potentially allow cloning if we could prove all uses of the 3806 // constant in the current function don't care about the address, like 3807 // printf format strings. But that isn't implemented for now. 3808 if (!allUsersAreInFunction(GVar, &F)) 3809 return SDValue(); 3810 3811 // We're going to inline this global. Pad it out if needed. 3812 if (RequiredPadding != 4) { 3813 StringRef S = CDAInit->getAsString(); 3814 3815 SmallVector<uint8_t,16> V(S.size()); 3816 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3817 while (RequiredPadding--) 3818 V.push_back(0); 3819 Init = ConstantDataArray::get(*DAG.getContext(), V); 3820 } 3821 3822 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3823 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); 3824 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3825 AFI->markGlobalAsPromotedToConstantPool(GVar); 3826 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3827 PaddedSize - 4); 3828 } 3829 ++NumConstpoolPromoted; 3830 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3831 } 3832 3833 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3834 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3835 if (!(GV = GA->getAliaseeObject())) 3836 return false; 3837 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3838 return V->isConstant(); 3839 return isa<Function>(GV); 3840 } 3841 3842 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3843 SelectionDAG &DAG) const { 3844 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3845 default: llvm_unreachable("unknown object format"); 3846 case Triple::COFF: 3847 return LowerGlobalAddressWindows(Op, DAG); 3848 case Triple::ELF: 3849 return LowerGlobalAddressELF(Op, DAG); 3850 case Triple::MachO: 3851 return LowerGlobalAddressDarwin(Op, DAG); 3852 } 3853 } 3854 3855 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3856 SelectionDAG &DAG) const { 3857 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3858 SDLoc dl(Op); 3859 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3860 const TargetMachine &TM = getTargetMachine(); 3861 bool IsRO = isReadOnly(GV); 3862 3863 // promoteToConstantPool only if not generating XO text section 3864 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3865 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3866 return V; 3867 3868 if (isPositionIndependent()) { 3869 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3870 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3871 UseGOT_PREL ? 
ARMII::MO_GOT : 0); 3872 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3873 if (UseGOT_PREL) 3874 Result = 3875 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3876 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3877 return Result; 3878 } else if (Subtarget->isROPI() && IsRO) { 3879 // PC-relative. 3880 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3881 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3882 return Result; 3883 } else if (Subtarget->isRWPI() && !IsRO) { 3884 // SB-relative. 3885 SDValue RelAddr; 3886 if (Subtarget->useMovt()) { 3887 ++NumMovwMovt; 3888 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3889 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3890 } else { // use literal pool for address constant 3891 ARMConstantPoolValue *CPV = 3892 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3893 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3894 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3895 RelAddr = DAG.getLoad( 3896 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3897 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3898 } 3899 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3900 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3901 return Result; 3902 } 3903 3904 // If we have T2 ops, we can materialize the address directly via movt/movw 3905 // pair. This is always cheaper. 3906 if (Subtarget->useMovt()) { 3907 ++NumMovwMovt; 3908 // FIXME: Once remat is capable of dealing with instructions with register 3909 // operands, expand this into two nodes. 3910 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3911 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3912 } else { 3913 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); 3914 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3915 return DAG.getLoad( 3916 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3917 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3918 } 3919 } 3920 3921 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3922 SelectionDAG &DAG) const { 3923 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3924 "ROPI/RWPI not currently supported for Darwin"); 3925 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3926 SDLoc dl(Op); 3927 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3928 3929 if (Subtarget->useMovt()) 3930 ++NumMovwMovt; 3931 3932 // FIXME: Once remat is capable of dealing with instructions with register 3933 // operands, expand this into multiple nodes 3934 unsigned Wrapper = 3935 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3936 3937 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3938 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3939 3940 if (Subtarget->isGVIndirectSymbol(GV)) 3941 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3942 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3943 return Result; 3944 } 3945 3946 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3947 SelectionDAG &DAG) const { 3948 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3949 assert(Subtarget->useMovt() && 3950 "Windows on ARM expects to use movw/movt"); 3951 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3952 "ROPI/RWPI not currently supported for Windows"); 3953 3954 const TargetMachine &TM = getTargetMachine(); 3955 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3956 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3957 if (GV->hasDLLImportStorageClass()) 3958 TargetFlags = ARMII::MO_DLLIMPORT; 3959 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3960 TargetFlags = ARMII::MO_COFFSTUB; 3961 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3962 SDValue Result; 3963 SDLoc DL(Op); 3964 3965 ++NumMovwMovt; 3966 3967 // FIXME: Once remat is capable of dealing with instructions with register 3968 // operands, expand this into two nodes. 3969 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3970 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3971 TargetFlags)); 3972 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3973 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3974 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3975 return Result; 3976 } 3977 3978 SDValue 3979 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3980 SDLoc dl(Op); 3981 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3982 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3983 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3984 Op.getOperand(1), Val); 3985 } 3986 3987 SDValue 3988 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3989 SDLoc dl(Op); 3990 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3991 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3992 } 3993 3994 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3995 SelectionDAG &DAG) const { 3996 SDLoc dl(Op); 3997 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3998 Op.getOperand(0)); 3999 } 4000 4001 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 4002 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 4003 unsigned IntNo = 4004 cast<ConstantSDNode>( 4005 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 4006 ->getZExtValue(); 4007 switch (IntNo) { 4008 default: 4009 return SDValue(); // Don't custom lower most intrinsics. 4010 case Intrinsic::arm_gnu_eabi_mcount: { 4011 MachineFunction &MF = DAG.getMachineFunction(); 4012 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4013 SDLoc dl(Op); 4014 SDValue Chain = Op.getOperand(0); 4015 // call "\01__gnu_mcount_nc" 4016 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 4017 const uint32_t *Mask = 4018 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 4019 assert(Mask && "Missing call preserved mask for calling convention"); 4020 // Mark LR an implicit live-in. 
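    // The emitted profiling sequence is roughly the usual ARM EABI mcount
    // form:
    //
    //   push {lr}
    //   bl   \01__gnu_mcount_nc
    //
    // The (t)BL_PUSHLR pseudos below bundle the LR push with the call;
    // __gnu_mcount_nc is expected to pop the saved LR itself.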
4021 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 4022 SDValue ReturnAddress = 4023 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 4024 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; 4025 SDValue Callee = 4026 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 4027 SDValue RegisterMask = DAG.getRegisterMask(Mask); 4028 if (Subtarget->isThumb()) 4029 return SDValue( 4030 DAG.getMachineNode( 4031 ARM::tBL_PUSHLR, dl, ResultTys, 4032 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 4033 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 4034 0); 4035 return SDValue( 4036 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 4037 {ReturnAddress, Callee, RegisterMask, Chain}), 4038 0); 4039 } 4040 } 4041 } 4042 4043 SDValue 4044 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 4045 const ARMSubtarget *Subtarget) const { 4046 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4047 SDLoc dl(Op); 4048 switch (IntNo) { 4049 default: return SDValue(); // Don't custom lower most intrinsics. 4050 case Intrinsic::thread_pointer: { 4051 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4052 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 4053 } 4054 case Intrinsic::arm_cls: { 4055 const SDValue &Operand = Op.getOperand(1); 4056 const EVT VTy = Op.getValueType(); 4057 SDValue SRA = 4058 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 4059 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 4060 SDValue SHL = 4061 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 4062 SDValue OR = 4063 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 4064 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 4065 return Result; 4066 } 4067 case Intrinsic::arm_cls64: { 4068 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 4069 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 4070 const SDValue &Operand = Op.getOperand(1); 4071 const EVT VTy = Op.getValueType(); 4072 4073 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4074 DAG.getConstant(1, dl, VTy)); 4075 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4076 DAG.getConstant(0, dl, VTy)); 4077 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 4078 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 4079 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 4080 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 4081 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 4082 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 4083 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 4084 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 4085 SDValue CheckLo = 4086 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 4087 SDValue HiIsZero = 4088 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 4089 SDValue AdjustedLo = 4090 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 4091 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 4092 SDValue Result = 4093 DAG.getSelect(dl, VTy, CheckLo, 4094 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 4095 return Result; 4096 } 4097 case Intrinsic::eh_sjlj_lsda: { 4098 MachineFunction &MF = DAG.getMachineFunction(); 4099 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4100 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 4101 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4102 SDValue CPAddr; 4103 
bool IsPositionIndependent = isPositionIndependent(); 4104 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 4105 ARMConstantPoolValue *CPV = 4106 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 4107 ARMCP::CPLSDA, PCAdj); 4108 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 4109 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 4110 SDValue Result = DAG.getLoad( 4111 PtrVT, dl, DAG.getEntryNode(), CPAddr, 4112 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 4113 4114 if (IsPositionIndependent) { 4115 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 4116 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 4117 } 4118 return Result; 4119 } 4120 case Intrinsic::arm_neon_vabs: 4121 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 4122 Op.getOperand(1)); 4123 case Intrinsic::arm_neon_vmulls: 4124 case Intrinsic::arm_neon_vmullu: { 4125 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 4126 ? ARMISD::VMULLs : ARMISD::VMULLu; 4127 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4128 Op.getOperand(1), Op.getOperand(2)); 4129 } 4130 case Intrinsic::arm_neon_vminnm: 4131 case Intrinsic::arm_neon_vmaxnm: { 4132 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 4133 ? ISD::FMINNUM : ISD::FMAXNUM; 4134 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4135 Op.getOperand(1), Op.getOperand(2)); 4136 } 4137 case Intrinsic::arm_neon_vminu: 4138 case Intrinsic::arm_neon_vmaxu: { 4139 if (Op.getValueType().isFloatingPoint()) 4140 return SDValue(); 4141 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 4142 ? ISD::UMIN : ISD::UMAX; 4143 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4144 Op.getOperand(1), Op.getOperand(2)); 4145 } 4146 case Intrinsic::arm_neon_vmins: 4147 case Intrinsic::arm_neon_vmaxs: { 4148 // v{min,max}s is overloaded between signed integers and floats. 4149 if (!Op.getValueType().isFloatingPoint()) { 4150 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4151 ? ISD::SMIN : ISD::SMAX; 4152 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4153 Op.getOperand(1), Op.getOperand(2)); 4154 } 4155 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4156 ? 
ISD::FMINIMUM : ISD::FMAXIMUM; 4157 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4158 Op.getOperand(1), Op.getOperand(2)); 4159 } 4160 case Intrinsic::arm_neon_vtbl1: 4161 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 4162 Op.getOperand(1), Op.getOperand(2)); 4163 case Intrinsic::arm_neon_vtbl2: 4164 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 4165 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4166 case Intrinsic::arm_mve_pred_i2v: 4167 case Intrinsic::arm_mve_pred_v2i: 4168 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 4169 Op.getOperand(1)); 4170 case Intrinsic::arm_mve_vreinterpretq: 4171 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), 4172 Op.getOperand(1)); 4173 case Intrinsic::arm_mve_lsll: 4174 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), 4175 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4176 case Intrinsic::arm_mve_asrl: 4177 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), 4178 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4179 } 4180 } 4181 4182 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 4183 const ARMSubtarget *Subtarget) { 4184 SDLoc dl(Op); 4185 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 4186 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 4187 if (SSID == SyncScope::SingleThread) 4188 return Op; 4189 4190 if (!Subtarget->hasDataBarrier()) { 4191 // Some ARMv6 cpus can support data barriers with an mcr instruction. 4192 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 4193 // here. 4194 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 4195 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 4196 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 4197 DAG.getConstant(0, dl, MVT::i32)); 4198 } 4199 4200 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 4201 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 4202 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 4203 if (Subtarget->isMClass()) { 4204 // Only a full system barrier exists in the M-class architectures. 4205 Domain = ARM_MB::SY; 4206 } else if (Subtarget->preferISHSTBarriers() && 4207 Ord == AtomicOrdering::Release) { 4208 // Swift happens to implement ISHST barriers in a way that's compatible with 4209 // Release semantics but weaker than ISH so we'd be fools not to use 4210 // it. Beware: other processors probably don't! 4211 Domain = ARM_MB::ISHST; 4212 } 4213 4214 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 4215 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 4216 DAG.getConstant(Domain, dl, MVT::i32)); 4217 } 4218 4219 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 4220 const ARMSubtarget *Subtarget) { 4221 // ARM pre v5TE and Thumb1 does not have preload instructions. 4222 if (!(Subtarget->isThumb2() || 4223 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 4224 // Just preserve the chain. 4225 return Op.getOperand(0); 4226 4227 SDLoc dl(Op); 4228 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 4229 if (!isRead && 4230 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 4231 // ARMv7 with MP extension has PLDW. 4232 return Op.getOperand(0); 4233 4234 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 4235 if (Subtarget->isThumb()) { 4236 // Invert the bits. 
4237     isRead = ~isRead & 1;
4238     isData = ~isData & 1;
4239   }
4240
4241   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4242                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4243                      DAG.getConstant(isData, dl, MVT::i32));
4244 }
4245
4246 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4247   MachineFunction &MF = DAG.getMachineFunction();
4248   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4249
4250   // vastart just stores the address of the VarArgsFrameIndex slot into the
4251   // memory location argument.
4252   SDLoc dl(Op);
4253   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4254   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4255   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4256   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4257                       MachinePointerInfo(SV));
4258 }
4259
4260 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4261                                                 CCValAssign &NextVA,
4262                                                 SDValue &Root,
4263                                                 SelectionDAG &DAG,
4264                                                 const SDLoc &dl) const {
4265   MachineFunction &MF = DAG.getMachineFunction();
4266   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4267
4268   const TargetRegisterClass *RC;
4269   if (AFI->isThumb1OnlyFunction())
4270     RC = &ARM::tGPRRegClass;
4271   else
4272     RC = &ARM::GPRRegClass;
4273
4274   // Transform the arguments stored in physical registers into virtual ones.
4275   Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4276   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4277
4278   SDValue ArgValue2;
4279   if (NextVA.isMemLoc()) {
4280     MachineFrameInfo &MFI = MF.getFrameInfo();
4281     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4282
4283     // Create load node to retrieve arguments from the stack.
4284     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4285     ArgValue2 = DAG.getLoad(
4286         MVT::i32, dl, Root, FIN,
4287         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4288   } else {
4289     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4290     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4291   }
4292   if (!Subtarget->isLittle())
4293     std::swap(ArgValue, ArgValue2);
4294   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4295 }
4296
4297 // The remaining GPRs hold either the beginning of variable-argument
4298 // data, or the beginning of an aggregate passed by value (usually
4299 // byval). Either way, we allocate stack slots adjacent to the data
4300 // provided by our caller, and store the unallocated registers there.
4301 // If this is a variadic function, the va_list pointer will begin with
4302 // these values; otherwise, this reassembles a (byval) structure that
4303 // was split between registers and memory.
4304 // Return: The frame index the registers were stored into.
4305 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4306                                       const SDLoc &dl, SDValue &Chain,
4307                                       const Value *OrigArg,
4308                                       unsigned InRegsParamRecordIdx,
4309                                       int ArgOffset, unsigned ArgSize) const {
4310   // Currently, two use-cases are possible:
4311   // Case #1. Non-var-args function, and we meet the first byval parameter.
4312   //          Set up the first unallocated register as the first byval
4313   //          register; eat all remaining registers
4314   //          (these two actions are performed by the HandleByVal method).
4315   //          Then, here, we initialize the stack frame with
4316   //          "store-reg" instructions.
4317   // Case #2. Var-args function that doesn't contain byval parameters.
4318   //          The same: eat all remaining unallocated registers and
4319   //          initialize the stack frame.
4320
4321   MachineFunction &MF = DAG.getMachineFunction();
4322   MachineFrameInfo &MFI = MF.getFrameInfo();
4323   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4324   unsigned RBegin, REnd;
4325   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4326     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4327   } else {
4328     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4329     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4330     REnd = ARM::R4;
4331   }
4332
4333   if (REnd != RBegin)
4334     ArgOffset = -4 * (ARM::R4 - RBegin);
4335
4336   auto PtrVT = getPointerTy(DAG.getDataLayout());
4337   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4338   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4339
4340   SmallVector<SDValue, 4> MemOps;
4341   const TargetRegisterClass *RC =
4342       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4343
4344   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4345     Register VReg = MF.addLiveIn(Reg, RC);
4346     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4347     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4348                                  MachinePointerInfo(OrigArg, 4 * i));
4349     MemOps.push_back(Store);
4350     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4351   }
4352
4353   if (!MemOps.empty())
4354     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4355   return FrameIndex;
4356 }
4357
4358 // Set up the stack frame that the va_list pointer will start from.
4359 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4360                                              const SDLoc &dl, SDValue &Chain,
4361                                              unsigned ArgOffset,
4362                                              unsigned TotalArgRegsSaveSize,
4363                                              bool ForceMutable) const {
4364   MachineFunction &MF = DAG.getMachineFunction();
4365   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4366
4367   // Try to store any remaining integer argument regs
4368   // to their spots on the stack so that they may be loaded by dereferencing
4369   // the result of va_next.
4370   // If there are no registers to be stored, just point the address after the
4371   // last argument passed via the stack.
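  // Illustrative sketch (the declaration below is hypothetical, not taken
  // from this file): for `void f(int a, ...)` under AAPCS, `a` occupies r0
  // and r1-r3 stay unallocated, so StoreByValRegs spills r1-r3 into a
  // 12-byte fixed object just below the incoming stack arguments; the
  // va_list pointer set up by va_start then points at that spill area, so
  // va_arg can walk the register and stack words contiguously.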
4372 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 4373 CCInfo.getInRegsParamsCount(), 4374 CCInfo.getNextStackOffset(), 4375 std::max(4U, TotalArgRegsSaveSize)); 4376 AFI->setVarArgsFrameIndex(FrameIndex); 4377 } 4378 4379 bool ARMTargetLowering::splitValueIntoRegisterParts( 4380 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 4381 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { 4382 bool IsABIRegCopy = CC.hasValue(); 4383 EVT ValueVT = Val.getValueType(); 4384 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4385 PartVT == MVT::f32) { 4386 unsigned ValueBits = ValueVT.getSizeInBits(); 4387 unsigned PartBits = PartVT.getSizeInBits(); 4388 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); 4389 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); 4390 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); 4391 Parts[0] = Val; 4392 return true; 4393 } 4394 return false; 4395 } 4396 4397 SDValue ARMTargetLowering::joinRegisterPartsIntoValue( 4398 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, 4399 MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { 4400 bool IsABIRegCopy = CC.hasValue(); 4401 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4402 PartVT == MVT::f32) { 4403 unsigned ValueBits = ValueVT.getSizeInBits(); 4404 unsigned PartBits = PartVT.getSizeInBits(); 4405 SDValue Val = Parts[0]; 4406 4407 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); 4408 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); 4409 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); 4410 return Val; 4411 } 4412 return SDValue(); 4413 } 4414 4415 SDValue ARMTargetLowering::LowerFormalArguments( 4416 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 4417 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4418 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4419 MachineFunction &MF = DAG.getMachineFunction(); 4420 MachineFrameInfo &MFI = MF.getFrameInfo(); 4421 4422 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4423 4424 // Assign locations to all of the incoming arguments. 4425 SmallVector<CCValAssign, 16> ArgLocs; 4426 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 4427 *DAG.getContext()); 4428 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4429 4430 SmallVector<SDValue, 16> ArgValues; 4431 SDValue ArgValue; 4432 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4433 unsigned CurArgIdx = 0; 4434 4435 // Initially ArgRegsSaveSize is zero. 4436 // Then we increase this value each time we meet byval parameter. 4437 // We also increase this value in case of varargs function. 4438 AFI->setArgRegsSaveSize(0); 4439 4440 // Calculate the amount of stack space that we need to allocate to store 4441 // byval and variadic arguments that are passed in registers. 4442 // We need to know this before we allocate the first byval or variadic 4443 // argument, as they will be allocated a stack slot below the CFA (Canonical 4444 // Frame Address, the stack pointer at entry to the function). 
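  // ArgRegBegin starts at R4, one past the last GPR argument register
  // (r0-r3), so the TotalArgRegsSaveSize computed below as
  // 4 * (ARM::R4 - ArgRegBegin) is zero unless some of r0-r3 actually need
  // to be spilled for a byval or variadic argument.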
4445 unsigned ArgRegBegin = ARM::R4; 4446 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4447 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4448 break; 4449 4450 CCValAssign &VA = ArgLocs[i]; 4451 unsigned Index = VA.getValNo(); 4452 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4453 if (!Flags.isByVal()) 4454 continue; 4455 4456 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4457 unsigned RBegin, REnd; 4458 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4459 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4460 4461 CCInfo.nextInRegsParam(); 4462 } 4463 CCInfo.rewindByValRegsInfo(); 4464 4465 int lastInsIndex = -1; 4466 if (isVarArg && MFI.hasVAStart()) { 4467 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4468 if (RegIdx != array_lengthof(GPRArgRegs)) 4469 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4470 } 4471 4472 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4473 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4474 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4475 4476 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4477 CCValAssign &VA = ArgLocs[i]; 4478 if (Ins[VA.getValNo()].isOrigArg()) { 4479 std::advance(CurOrigArg, 4480 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4481 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4482 } 4483 // Arguments stored in registers. 4484 if (VA.isRegLoc()) { 4485 EVT RegVT = VA.getLocVT(); 4486 4487 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 4488 // f64 and vector types are split up into multiple registers or 4489 // combinations of registers and stack slots. 4490 SDValue ArgValue1 = 4491 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4492 VA = ArgLocs[++i]; // skip ahead to next loc 4493 SDValue ArgValue2; 4494 if (VA.isMemLoc()) { 4495 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4496 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4497 ArgValue2 = DAG.getLoad( 4498 MVT::f64, dl, Chain, FIN, 4499 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4500 } else { 4501 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4502 } 4503 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4504 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4505 ArgValue1, DAG.getIntPtrConstant(0, dl)); 4506 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4507 ArgValue2, DAG.getIntPtrConstant(1, dl)); 4508 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 4509 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4510 } else { 4511 const TargetRegisterClass *RC; 4512 4513 if (RegVT == MVT::f16 || RegVT == MVT::bf16) 4514 RC = &ARM::HPRRegClass; 4515 else if (RegVT == MVT::f32) 4516 RC = &ARM::SPRRegClass; 4517 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || 4518 RegVT == MVT::v4bf16) 4519 RC = &ARM::DPRRegClass; 4520 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || 4521 RegVT == MVT::v8bf16) 4522 RC = &ARM::QPRRegClass; 4523 else if (RegVT == MVT::i32) 4524 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4525 : &ARM::GPRRegClass; 4526 else 4527 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4528 4529 // Transform the arguments in physical registers into virtual ones. 4530 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 4531 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4532 4533 // If this value is passed in r0 and has the returned attribute (e.g. 
4534 // C++ 'structors), record this fact for later use. 4535 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4536 AFI->setPreservesR0(); 4537 } 4538 } 4539 4540 // If this is an 8 or 16-bit value, it is really passed promoted 4541 // to 32 bits. Insert an assert[sz]ext to capture this, then 4542 // truncate to the right size. 4543 switch (VA.getLocInfo()) { 4544 default: llvm_unreachable("Unknown loc info!"); 4545 case CCValAssign::Full: break; 4546 case CCValAssign::BCvt: 4547 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4548 break; 4549 case CCValAssign::SExt: 4550 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4551 DAG.getValueType(VA.getValVT())); 4552 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4553 break; 4554 case CCValAssign::ZExt: 4555 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4556 DAG.getValueType(VA.getValVT())); 4557 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4558 break; 4559 } 4560 4561 // f16 arguments have their size extended to 4 bytes and passed as if they 4562 // had been copied to the LSBs of a 32-bit register. 4563 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 4564 if (VA.needsCustom() && 4565 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 4566 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); 4567 4568 InVals.push_back(ArgValue); 4569 } else { // VA.isRegLoc() 4570 // Only arguments passed on the stack should make it here. 4571 assert(VA.isMemLoc()); 4572 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4573 4574 int index = VA.getValNo(); 4575 4576 // Some Ins[] entries become multiple ArgLoc[] entries. 4577 // Process them only once. 4578 if (index != lastInsIndex) 4579 { 4580 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4581 // FIXME: For now, all byval parameter objects are marked mutable. 4582 // This can be changed with more analysis. 4583 // In case of tail call optimization mark all arguments mutable. 4584 // Since they could be overwritten by lowering of arguments in case of 4585 // a tail call. 4586 if (Flags.isByVal()) { 4587 assert(Ins[index].isOrigArg() && 4588 "Byval arguments cannot be implicit"); 4589 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4590 4591 int FrameIndex = StoreByValRegs( 4592 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4593 VA.getLocMemOffset(), Flags.getByValSize()); 4594 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4595 CCInfo.nextInRegsParam(); 4596 } else { 4597 unsigned FIOffset = VA.getLocMemOffset(); 4598 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4599 FIOffset, true); 4600 4601 // Create load nodes to retrieve arguments from the stack. 
4602 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4603 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4604 MachinePointerInfo::getFixedStack( 4605 DAG.getMachineFunction(), FI))); 4606 } 4607 lastInsIndex = index; 4608 } 4609 } 4610 } 4611 4612 // varargs 4613 if (isVarArg && MFI.hasVAStart()) { 4614 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), 4615 TotalArgRegsSaveSize); 4616 if (AFI->isCmseNSEntryFunction()) { 4617 DiagnosticInfoUnsupported Diag( 4618 DAG.getMachineFunction().getFunction(), 4619 "secure entry function must not be variadic", dl.getDebugLoc()); 4620 DAG.getContext()->diagnose(Diag); 4621 } 4622 } 4623 4624 unsigned StackArgSize = CCInfo.getNextStackOffset(); 4625 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 4626 if (canGuaranteeTCO(CallConv, TailCallOpt)) { 4627 // The only way to guarantee a tail call is if the callee restores its 4628 // argument area, but it must also keep the stack aligned when doing so. 4629 const DataLayout &DL = DAG.getDataLayout(); 4630 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); 4631 4632 AFI->setArgumentStackToRestore(StackArgSize); 4633 } 4634 AFI->setArgumentStackSize(StackArgSize); 4635 4636 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { 4637 DiagnosticInfoUnsupported Diag( 4638 DAG.getMachineFunction().getFunction(), 4639 "secure entry function requires arguments on stack", dl.getDebugLoc()); 4640 DAG.getContext()->diagnose(Diag); 4641 } 4642 4643 return Chain; 4644 } 4645 4646 /// isFloatingPointZero - Return true if this is +0.0. 4647 static bool isFloatingPointZero(SDValue Op) { 4648 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4649 return CFP->getValueAPF().isPosZero(); 4650 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4651 // Maybe this has already been legalized into the constant pool? 4652 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4653 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4654 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4655 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4656 return CFP->getValueAPF().isPosZero(); 4657 } 4658 } else if (Op->getOpcode() == ISD::BITCAST && 4659 Op->getValueType(0) == MVT::f64) { 4660 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4661 // created by LowerConstantFP(). 4662 SDValue BitcastOp = Op->getOperand(0); 4663 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4664 isNullConstant(BitcastOp->getOperand(0))) 4665 return true; 4666 } 4667 return false; 4668 } 4669 4670 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4671 /// the given operands. 4672 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4673 SDValue &ARMcc, SelectionDAG &DAG, 4674 const SDLoc &dl) const { 4675 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4676 unsigned C = RHSC->getZExtValue(); 4677 if (!isLegalICmpImmediate((int32_t)C)) { 4678 // Constant does not fit, try adjusting it by one. 4679 switch (CC) { 4680 default: break; 4681 case ISD::SETLT: 4682 case ISD::SETGE: 4683 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4684 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4685 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4686 } 4687 break; 4688 case ISD::SETULT: 4689 case ISD::SETUGE: 4690 if (C != 0 && isLegalICmpImmediate(C-1)) { 4691 CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; 4692 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4693 } 4694 break; 4695 case ISD::SETLE: 4696 case ISD::SETGT: 4697 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4698 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 4699 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4700 } 4701 break; 4702 case ISD::SETULE: 4703 case ISD::SETUGT: 4704 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4705 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4706 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4707 } 4708 break; 4709 } 4710 } 4711 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4712 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4713 // In ARM and Thumb-2, the compare instructions can shift their second 4714 // operand. 4715 CC = ISD::getSetCCSwappedOperands(CC); 4716 std::swap(LHS, RHS); 4717 } 4718 4719 // Thumb1 has very limited immediate modes, so turning an "and" into a 4720 // shift can save multiple instructions. 4721 // 4722 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4723 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4724 // own. If it's the operand to an unsigned comparison with an immediate, 4725 // we can eliminate one of the shifts: we transform 4726 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4727 // 4728 // We avoid transforming cases which aren't profitable due to encoding 4729 // details: 4730 // 4731 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4732 // would not; in that case, we're essentially trading one immediate load for 4733 // another. 4734 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4735 // 3. C2 is zero; we have other code for this special case. 4736 // 4737 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4738 // instruction, since the AND is always one instruction anyway, but we could 4739 // use narrow instructions in some cases. 4740 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4741 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4742 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4743 !isSignedIntSetCC(CC)) { 4744 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4745 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4746 uint64_t RHSV = RHSC->getZExtValue(); 4747 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4748 unsigned ShiftBits = countLeadingZeros(Mask); 4749 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4750 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4751 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4752 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4753 } 4754 } 4755 } 4756 4757 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4758 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4759 // way a cmp would. 4760 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4761 // some tweaks to the heuristics for the previous and->shift transform. 4762 // FIXME: Optimize cases where the LHS isn't a shift. 
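  // Worked example (illustrative register name): with c == 3,
  // "(x << 3) > 0x80000000u" becomes "lsls rN, rN, #4" followed by a
  // HI-predicated use; the last bit shifted out lands in C and Z is set only
  // if every remaining bit is zero, so HI (C set, Z clear) holds exactly when
  // (x << 3) exceeds 0x80000000.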
4763 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4764 isa<ConstantSDNode>(RHS) && 4765 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4766 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4767 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4768 unsigned ShiftAmt = 4769 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4770 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4771 DAG.getVTList(MVT::i32, MVT::i32), 4772 LHS.getOperand(0), 4773 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4774 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4775 Shift.getValue(1), SDValue()); 4776 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4777 return Chain.getValue(1); 4778 } 4779 4780 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4781 4782 // If the RHS is a constant zero then the V (overflow) flag will never be 4783 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4784 // simpler for other passes (like the peephole optimiser) to deal with. 4785 if (isNullConstant(RHS)) { 4786 switch (CondCode) { 4787 default: break; 4788 case ARMCC::GE: 4789 CondCode = ARMCC::PL; 4790 break; 4791 case ARMCC::LT: 4792 CondCode = ARMCC::MI; 4793 break; 4794 } 4795 } 4796 4797 ARMISD::NodeType CompareType; 4798 switch (CondCode) { 4799 default: 4800 CompareType = ARMISD::CMP; 4801 break; 4802 case ARMCC::EQ: 4803 case ARMCC::NE: 4804 // Uses only Z Flag 4805 CompareType = ARMISD::CMPZ; 4806 break; 4807 } 4808 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4809 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4810 } 4811 4812 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 4813 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4814 SelectionDAG &DAG, const SDLoc &dl, 4815 bool Signaling) const { 4816 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4817 SDValue Cmp; 4818 if (!isFloatingPointZero(RHS)) 4819 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4820 dl, MVT::Glue, LHS, RHS); 4821 else 4822 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4823 dl, MVT::Glue, LHS); 4824 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4825 } 4826 4827 /// duplicateCmp - Glue values can have only one use, so this function 4828 /// duplicates a comparison node. 4829 SDValue 4830 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4831 unsigned Opc = Cmp.getOpcode(); 4832 SDLoc DL(Cmp); 4833 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4834 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4835 4836 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4837 Cmp = Cmp.getOperand(0); 4838 Opc = Cmp.getOpcode(); 4839 if (Opc == ARMISD::CMPFP) 4840 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4841 else { 4842 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4843 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4844 } 4845 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4846 } 4847 4848 // This function returns three things: the arithmetic computation itself 4849 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4850 // comparison and the condition code define the case in which the arithmetic 4851 // computation *does not* overflow. 
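// For example (a sketch of the ISD::SADDO case handled below): Value is the
// plain ADD of the operands and OverflowCmp is CMP(Value, LHS); subtracting
// LHS back out of the sum sets the V flag exactly when the original signed
// addition overflowed, so ARMcc == VC ("no overflow") names the case in which
// the computation is safe.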
4852 std::pair<SDValue, SDValue> 4853 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4854 SDValue &ARMcc) const { 4855 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4856 4857 SDValue Value, OverflowCmp; 4858 SDValue LHS = Op.getOperand(0); 4859 SDValue RHS = Op.getOperand(1); 4860 SDLoc dl(Op); 4861 4862 // FIXME: We are currently always generating CMPs because we don't support 4863 // generating CMN through the backend. This is not as good as the natural 4864 // CMP case because it causes a register dependency and cannot be folded 4865 // later. 4866 4867 switch (Op.getOpcode()) { 4868 default: 4869 llvm_unreachable("Unknown overflow instruction!"); 4870 case ISD::SADDO: 4871 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4872 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4873 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4874 break; 4875 case ISD::UADDO: 4876 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4877 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4878 // We do not use it in the USUBO case as Value may not be used. 4879 Value = DAG.getNode(ARMISD::ADDC, dl, 4880 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4881 .getValue(0); 4882 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4883 break; 4884 case ISD::SSUBO: 4885 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4886 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4887 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4888 break; 4889 case ISD::USUBO: 4890 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4891 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4892 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4893 break; 4894 case ISD::UMULO: 4895 // We generate a UMUL_LOHI and then check if the high word is 0. 4896 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4897 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4898 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4899 LHS, RHS); 4900 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4901 DAG.getConstant(0, dl, MVT::i32)); 4902 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4903 break; 4904 case ISD::SMULO: 4905 // We generate a SMUL_LOHI and then check if all the bits of the high word 4906 // are the same as the sign bit of the low word. 4907 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4908 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4909 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4910 LHS, RHS); 4911 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4912 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4913 Value.getValue(0), 4914 DAG.getConstant(31, dl, MVT::i32))); 4915 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4916 break; 4917 } // switch (...) 4918 4919 return std::make_pair(Value, OverflowCmp); 4920 } 4921 4922 SDValue 4923 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4924 // Let legalize expand this if it isn't a legal type yet. 4925 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4926 return SDValue(); 4927 4928 SDValue Value, OverflowCmp; 4929 SDValue ARMcc; 4930 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4931 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4932 SDLoc dl(Op); 4933 // We use 0 and 1 as false and true values. 
4934 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4935 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4936 EVT VT = Op.getValueType(); 4937 4938 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4939 ARMcc, CCR, OverflowCmp); 4940 4941 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4942 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4943 } 4944 4945 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4946 SelectionDAG &DAG) { 4947 SDLoc DL(BoolCarry); 4948 EVT CarryVT = BoolCarry.getValueType(); 4949 4950 // This converts the boolean value carry into the carry flag by doing 4951 // ARMISD::SUBC Carry, 1 4952 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4953 DAG.getVTList(CarryVT, MVT::i32), 4954 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4955 return Carry.getValue(1); 4956 } 4957 4958 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4959 SelectionDAG &DAG) { 4960 SDLoc DL(Flags); 4961 4962 // Now convert the carry flag into a boolean carry. We do this 4963 // using ARMISD:ADDE 0, 0, Carry 4964 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4965 DAG.getConstant(0, DL, MVT::i32), 4966 DAG.getConstant(0, DL, MVT::i32), Flags); 4967 } 4968 4969 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4970 SelectionDAG &DAG) const { 4971 // Let legalize expand this if it isn't a legal type yet. 4972 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4973 return SDValue(); 4974 4975 SDValue LHS = Op.getOperand(0); 4976 SDValue RHS = Op.getOperand(1); 4977 SDLoc dl(Op); 4978 4979 EVT VT = Op.getValueType(); 4980 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4981 SDValue Value; 4982 SDValue Overflow; 4983 switch (Op.getOpcode()) { 4984 default: 4985 llvm_unreachable("Unknown overflow instruction!"); 4986 case ISD::UADDO: 4987 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4988 // Convert the carry flag into a boolean value. 4989 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4990 break; 4991 case ISD::USUBO: { 4992 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4993 // Convert the carry flag into a boolean value. 4994 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4995 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4996 // value. So compute 1 - C. 
4997 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4998 DAG.getConstant(1, dl, MVT::i32), Overflow); 4999 break; 5000 } 5001 } 5002 5003 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 5004 } 5005 5006 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, 5007 const ARMSubtarget *Subtarget) { 5008 EVT VT = Op.getValueType(); 5009 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 5010 return SDValue(); 5011 if (!VT.isSimple()) 5012 return SDValue(); 5013 5014 unsigned NewOpcode; 5015 switch (VT.getSimpleVT().SimpleTy) { 5016 default: 5017 return SDValue(); 5018 case MVT::i8: 5019 switch (Op->getOpcode()) { 5020 case ISD::UADDSAT: 5021 NewOpcode = ARMISD::UQADD8b; 5022 break; 5023 case ISD::SADDSAT: 5024 NewOpcode = ARMISD::QADD8b; 5025 break; 5026 case ISD::USUBSAT: 5027 NewOpcode = ARMISD::UQSUB8b; 5028 break; 5029 case ISD::SSUBSAT: 5030 NewOpcode = ARMISD::QSUB8b; 5031 break; 5032 } 5033 break; 5034 case MVT::i16: 5035 switch (Op->getOpcode()) { 5036 case ISD::UADDSAT: 5037 NewOpcode = ARMISD::UQADD16b; 5038 break; 5039 case ISD::SADDSAT: 5040 NewOpcode = ARMISD::QADD16b; 5041 break; 5042 case ISD::USUBSAT: 5043 NewOpcode = ARMISD::UQSUB16b; 5044 break; 5045 case ISD::SSUBSAT: 5046 NewOpcode = ARMISD::QSUB16b; 5047 break; 5048 } 5049 break; 5050 } 5051 5052 SDLoc dl(Op); 5053 SDValue Add = 5054 DAG.getNode(NewOpcode, dl, MVT::i32, 5055 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 5056 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 5057 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 5058 } 5059 5060 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 5061 SDValue Cond = Op.getOperand(0); 5062 SDValue SelectTrue = Op.getOperand(1); 5063 SDValue SelectFalse = Op.getOperand(2); 5064 SDLoc dl(Op); 5065 unsigned Opc = Cond.getOpcode(); 5066 5067 if (Cond.getResNo() == 1 && 5068 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5069 Opc == ISD::USUBO)) { 5070 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5071 return SDValue(); 5072 5073 SDValue Value, OverflowCmp; 5074 SDValue ARMcc; 5075 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5076 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5077 EVT VT = Op.getValueType(); 5078 5079 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 5080 OverflowCmp, DAG); 5081 } 5082 5083 // Convert: 5084 // 5085 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 5086 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 5087 // 5088 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 5089 const ConstantSDNode *CMOVTrue = 5090 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 5091 const ConstantSDNode *CMOVFalse = 5092 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 5093 5094 if (CMOVTrue && CMOVFalse) { 5095 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 5096 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 5097 5098 SDValue True; 5099 SDValue False; 5100 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 5101 True = SelectTrue; 5102 False = SelectFalse; 5103 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 5104 True = SelectFalse; 5105 False = SelectTrue; 5106 } 5107 5108 if (True.getNode() && False.getNode()) { 5109 EVT VT = Op.getValueType(); 5110 SDValue ARMcc = Cond.getOperand(2); 5111 SDValue CCR = Cond.getOperand(3); 5112 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 5113 assert(True.getValueType() == VT); 5114 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 5115 } 5116 } 5117 } 5118 5119 // 
ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 5120 // undefined bits before doing a full-word comparison with zero. 5121 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 5122 DAG.getConstant(1, dl, Cond.getValueType())); 5123 5124 return DAG.getSelectCC(dl, Cond, 5125 DAG.getConstant(0, dl, Cond.getValueType()), 5126 SelectTrue, SelectFalse, ISD::SETNE); 5127 } 5128 5129 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 5130 bool &swpCmpOps, bool &swpVselOps) { 5131 // Start by selecting the GE condition code for opcodes that return true for 5132 // 'equality' 5133 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 5134 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 5135 CondCode = ARMCC::GE; 5136 5137 // and GT for opcodes that return false for 'equality'. 5138 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 5139 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 5140 CondCode = ARMCC::GT; 5141 5142 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 5143 // to swap the compare operands. 5144 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 5145 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 5146 swpCmpOps = true; 5147 5148 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 5149 // If we have an unordered opcode, we need to swap the operands to the VSEL 5150 // instruction (effectively negating the condition). 5151 // 5152 // This also has the effect of swapping which one of 'less' or 'greater' 5153 // returns true, so we also swap the compare operands. It also switches 5154 // whether we return true for 'equality', so we compensate by picking the 5155 // opposite condition code to our original choice. 5156 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 5157 CC == ISD::SETUGT) { 5158 swpCmpOps = !swpCmpOps; 5159 swpVselOps = !swpVselOps; 5160 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 5161 } 5162 5163 // 'ordered' is 'anything but unordered', so use the VS condition code and 5164 // swap the VSEL operands. 5165 if (CC == ISD::SETO) { 5166 CondCode = ARMCC::VS; 5167 swpVselOps = true; 5168 } 5169 5170 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 5171 // code and swap the VSEL operands. Also do this if we don't care about the 5172 // unordered case. 
5173 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 5174 CondCode = ARMCC::EQ; 5175 swpVselOps = true; 5176 } 5177 } 5178 5179 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 5180 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 5181 SDValue Cmp, SelectionDAG &DAG) const { 5182 if (!Subtarget->hasFP64() && VT == MVT::f64) { 5183 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5184 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 5185 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5186 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 5187 5188 SDValue TrueLow = TrueVal.getValue(0); 5189 SDValue TrueHigh = TrueVal.getValue(1); 5190 SDValue FalseLow = FalseVal.getValue(0); 5191 SDValue FalseHigh = FalseVal.getValue(1); 5192 5193 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 5194 ARMcc, CCR, Cmp); 5195 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 5196 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 5197 5198 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 5199 } else { 5200 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 5201 Cmp); 5202 } 5203 } 5204 5205 static bool isGTorGE(ISD::CondCode CC) { 5206 return CC == ISD::SETGT || CC == ISD::SETGE; 5207 } 5208 5209 static bool isLTorLE(ISD::CondCode CC) { 5210 return CC == ISD::SETLT || CC == ISD::SETLE; 5211 } 5212 5213 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 5214 // All of these conditions (and their <= and >= counterparts) will do: 5215 // x < k ? k : x 5216 // x > k ? x : k 5217 // k < x ? x : k 5218 // k > x ? k : x 5219 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 5220 const SDValue TrueVal, const SDValue FalseVal, 5221 const ISD::CondCode CC, const SDValue K) { 5222 return (isGTorGE(CC) && 5223 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 5224 (isLTorLE(CC) && 5225 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 5226 } 5227 5228 // Check if two chained conditionals could be converted into SSAT or USAT. 5229 // 5230 // SSAT can replace a set of two conditional selectors that bound a number to an 5231 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 5232 // 5233 // x < -k ? -k : (x > k ? k : x) 5234 // x < -k ? -k : (x < k ? x : k) 5235 // x > -k ? (x > k ? k : x) : -k 5236 // x < k ? (x < -k ? -k : x) : k 5237 // etc. 5238 // 5239 // LLVM canonicalizes these to either a min(max()) or a max(min()) 5240 // pattern. This function tries to match one of these and will return a SSAT 5241 // node if successful. 5242 // 5243 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 5244 // is a power of 2. 5245 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { 5246 EVT VT = Op.getValueType(); 5247 SDValue V1 = Op.getOperand(0); 5248 SDValue K1 = Op.getOperand(1); 5249 SDValue TrueVal1 = Op.getOperand(2); 5250 SDValue FalseVal1 = Op.getOperand(3); 5251 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5252 5253 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? 
FalseVal1 : TrueVal1; 5254 if (Op2.getOpcode() != ISD::SELECT_CC) 5255 return SDValue(); 5256 5257 SDValue V2 = Op2.getOperand(0); 5258 SDValue K2 = Op2.getOperand(1); 5259 SDValue TrueVal2 = Op2.getOperand(2); 5260 SDValue FalseVal2 = Op2.getOperand(3); 5261 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 5262 5263 SDValue V1Tmp = V1; 5264 SDValue V2Tmp = V2; 5265 5266 // Check that the registers and the constants match a max(min()) or min(max()) 5267 // pattern 5268 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || 5269 K2 != FalseVal2 || 5270 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) 5271 return SDValue(); 5272 5273 // Check that the constant in the lower-bound check is 5274 // the opposite of the constant in the upper-bound check 5275 // in 1's complement. 5276 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) 5277 return SDValue(); 5278 5279 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); 5280 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); 5281 int64_t PosVal = std::max(Val1, Val2); 5282 int64_t NegVal = std::min(Val1, Val2); 5283 5284 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || 5285 !isPowerOf2_64(PosVal + 1)) 5286 return SDValue(); 5287 5288 // Handle the difference between USAT (unsigned) and SSAT (signed) 5289 // saturation 5290 // At this point, PosVal is guaranteed to be positive 5291 uint64_t K = PosVal; 5292 SDLoc dl(Op); 5293 if (Val1 == ~Val2) 5294 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, 5295 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5296 if (NegVal == 0) 5297 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, 5298 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5299 5300 return SDValue(); 5301 } 5302 5303 // Check if a condition of the type x < k ? k : x can be converted into a 5304 // bit operation instead of conditional moves. 5305 // Currently this is allowed given: 5306 // - The conditions and values match up 5307 // - k is 0 or -1 (all ones) 5308 // This function will not check the last condition, thats up to the caller 5309 // It returns true if the transformation can be made, and in such case 5310 // returns x in V, and k in SatK. 5311 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 5312 SDValue &SatK) 5313 { 5314 SDValue LHS = Op.getOperand(0); 5315 SDValue RHS = Op.getOperand(1); 5316 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5317 SDValue TrueVal = Op.getOperand(2); 5318 SDValue FalseVal = Op.getOperand(3); 5319 5320 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 5321 ? &RHS 5322 : nullptr; 5323 5324 // No constant operation in comparison, early out 5325 if (!K) 5326 return false; 5327 5328 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 5329 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 5330 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 5331 5332 // If the constant on left and right side, or variable on left and right, 5333 // does not match, early out 5334 if (*K != KTmp || V != VTmp) 5335 return false; 5336 5337 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 5338 SatK = *K; 5339 return true; 5340 } 5341 5342 return false; 5343 } 5344 5345 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 5346 if (VT == MVT::f32) 5347 return !Subtarget->hasVFP2Base(); 5348 if (VT == MVT::f64) 5349 return !Subtarget->hasFP64(); 5350 if (VT == MVT::f16) 5351 return !Subtarget->hasFullFP16(); 5352 return false; 5353 } 5354 5355 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 5356 EVT VT = Op.getValueType(); 5357 SDLoc dl(Op); 5358 5359 // Try to convert two saturating conditional selects into a single SSAT 5360 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) 5361 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) 5362 return SatValue; 5363 5364 // Try to convert expressions of the form x < k ? k : x (and similar forms) 5365 // into more efficient bit operations, which is possible when k is 0 or -1 5366 // On ARM and Thumb-2 which have flexible operand 2 this will result in 5367 // single instructions. On Thumb the shift and the bit operation will be two 5368 // instructions. 5369 // Only allow this transformation on full-width (32-bit) operations 5370 SDValue LowerSatConstant; 5371 SDValue SatValue; 5372 if (VT == MVT::i32 && 5373 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 5374 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 5375 DAG.getConstant(31, dl, VT)); 5376 if (isNullConstant(LowerSatConstant)) { 5377 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 5378 DAG.getAllOnesConstant(dl, VT)); 5379 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 5380 } else if (isAllOnesConstant(LowerSatConstant)) 5381 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 5382 } 5383 5384 SDValue LHS = Op.getOperand(0); 5385 SDValue RHS = Op.getOperand(1); 5386 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5387 SDValue TrueVal = Op.getOperand(2); 5388 SDValue FalseVal = Op.getOperand(3); 5389 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 5390 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 5391 5392 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 5393 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 5394 unsigned TVal = CTVal->getZExtValue(); 5395 unsigned FVal = CFVal->getZExtValue(); 5396 unsigned Opcode = 0; 5397 5398 if (TVal == ~FVal) { 5399 Opcode = ARMISD::CSINV; 5400 } else if (TVal == ~FVal + 1) { 5401 Opcode = ARMISD::CSNEG; 5402 } else if (TVal + 1 == FVal) { 5403 Opcode = ARMISD::CSINC; 5404 } else if (TVal == FVal + 1) { 5405 Opcode = ARMISD::CSINC; 5406 std::swap(TrueVal, FalseVal); 5407 std::swap(TVal, FVal); 5408 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5409 } 5410 5411 if (Opcode) { 5412 // If one of the constants is cheaper than another, materialise the 5413 // cheaper one and let the csel generate the other. 5414 if (Opcode != ARMISD::CSINC && 5415 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { 5416 std::swap(TrueVal, FalseVal); 5417 std::swap(TVal, FVal); 5418 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5419 } 5420 5421 // Attempt to use ZR checking TVal is 0, possibly inverting the condition 5422 // to get there. 
CSINC is not invertible like the other two (~(~a) == a,
5423       // -(-a) == a, but (a+1)+1 != a).
5424       if (FVal == 0 && Opcode != ARMISD::CSINC) {
5425         std::swap(TrueVal, FalseVal);
5426         std::swap(TVal, FVal);
5427         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5428       }
5429
5430       // Drops F's value because we can get it by inverting/negating TVal.
5431       FalseVal = TrueVal;
5432
5433       SDValue ARMcc;
5434       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5435       EVT VT = TrueVal.getValueType();
5436       return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5437     }
5438   }
5439
5440   if (isUnsupportedFloatingType(LHS.getValueType())) {
5441     DAG.getTargetLoweringInfo().softenSetCCOperands(
5442         DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5443
5444     // If softenSetCCOperands only returned one value, we should compare it to
5445     // zero.
5446     if (!RHS.getNode()) {
5447       RHS = DAG.getConstant(0, dl, LHS.getValueType());
5448       CC = ISD::SETNE;
5449     }
5450   }
5451
5452   if (LHS.getValueType() == MVT::i32) {
5453     // Try to generate VSEL on ARMv8.
5454     // The VSEL instruction can't use all the usual ARM condition
5455     // codes: it only has two bits to select the condition code, so it's
5456     // constrained to use only GE, GT, VS and EQ.
5457     //
5458     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5459     // swap the operands of the previous compare instruction (effectively
5460     // inverting the compare condition, swapping 'less' and 'greater') and
5461     // sometimes need to swap the operands to the VSEL (which inverts the
5462     // condition in the sense of firing whenever the previous condition didn't).
5463     if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5464                                         TrueVal.getValueType() == MVT::f32 ||
5465                                         TrueVal.getValueType() == MVT::f64)) {
5466       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5467       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5468           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5469         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5470         std::swap(TrueVal, FalseVal);
5471       }
5472     }
5473
5474     SDValue ARMcc;
5475     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5476     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5477     // Choose GE over PL, which vsel does not support.
5478     if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5479       ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5480     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5481   }
5482
5483   ARMCC::CondCodes CondCode, CondCode2;
5484   FPCCToARMCC(CC, CondCode, CondCode2);
5485
5486   // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5487   // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5488   // must use VSEL (limited condition codes), due to not having conditional f16
5489   // moves.
5490 if (Subtarget->hasFPARMv8Base() && 5491 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5492 (TrueVal.getValueType() == MVT::f16 || 5493 TrueVal.getValueType() == MVT::f32 || 5494 TrueVal.getValueType() == MVT::f64)) { 5495 bool swpCmpOps = false; 5496 bool swpVselOps = false; 5497 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5498 5499 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5500 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5501 if (swpCmpOps) 5502 std::swap(LHS, RHS); 5503 if (swpVselOps) 5504 std::swap(TrueVal, FalseVal); 5505 } 5506 } 5507 5508 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5509 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5510 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5511 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5512 if (CondCode2 != ARMCC::AL) { 5513 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5514 // FIXME: Needs another CMP because flag can have but one use. 5515 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5516 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5517 } 5518 return Result; 5519 } 5520 5521 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5522 /// to morph to an integer compare sequence. 5523 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5524 const ARMSubtarget *Subtarget) { 5525 SDNode *N = Op.getNode(); 5526 if (!N->hasOneUse()) 5527 // Otherwise it requires moving the value from fp to integer registers. 5528 return false; 5529 if (!N->getNumValues()) 5530 return false; 5531 EVT VT = Op.getValueType(); 5532 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5533 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5534 // vmrs are very slow, e.g. cortex-a8. 5535 return false; 5536 5537 if (isFloatingPointZero(Op)) { 5538 SeenZero = true; 5539 return true; 5540 } 5541 return ISD::isNormalLoad(N); 5542 } 5543 5544 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5545 if (isFloatingPointZero(Op)) 5546 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5547 5548 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5549 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5550 Ld->getPointerInfo(), Ld->getAlignment(), 5551 Ld->getMemOperand()->getFlags()); 5552 5553 llvm_unreachable("Unknown VFP cmp argument!"); 5554 } 5555 5556 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5557 SDValue &RetVal1, SDValue &RetVal2) { 5558 SDLoc dl(Op); 5559 5560 if (isFloatingPointZero(Op)) { 5561 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5562 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5563 return; 5564 } 5565 5566 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5567 SDValue Ptr = Ld->getBasePtr(); 5568 RetVal1 = 5569 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5570 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5571 5572 EVT PtrType = Ptr.getValueType(); 5573 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5574 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5575 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5576 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5577 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5578 Ld->getMemOperand()->getFlags()); 5579 return; 5580 } 5581 5582 llvm_unreachable("Unknown VFP cmp argument!"); 5583 } 5584 5585 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5586 /// f32 and even f64 comparisons to integer ones. 
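/// For instance (illustrative, and only when one operand is known to be +0.0
/// and the condition is an equality): an f32 "a == 0.0f" can be rewritten as
/// an integer compare of a's bit pattern, with the sign bit masked off,
/// against zero, avoiding the VFP compare and the FMSTAT transfer of the FP
/// status flags.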
5587 SDValue 5588 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5589 SDValue Chain = Op.getOperand(0); 5590 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5591 SDValue LHS = Op.getOperand(2); 5592 SDValue RHS = Op.getOperand(3); 5593 SDValue Dest = Op.getOperand(4); 5594 SDLoc dl(Op); 5595 5596 bool LHSSeenZero = false; 5597 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5598 bool RHSSeenZero = false; 5599 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5600 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5601 // If unsafe fp math optimization is enabled and there are no other uses of 5602 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5603 // to an integer comparison. 5604 if (CC == ISD::SETOEQ) 5605 CC = ISD::SETEQ; 5606 else if (CC == ISD::SETUNE) 5607 CC = ISD::SETNE; 5608 5609 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5610 SDValue ARMcc; 5611 if (LHS.getValueType() == MVT::f32) { 5612 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5613 bitcastf32Toi32(LHS, DAG), Mask); 5614 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5615 bitcastf32Toi32(RHS, DAG), Mask); 5616 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5617 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5618 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5619 Chain, Dest, ARMcc, CCR, Cmp); 5620 } 5621 5622 SDValue LHS1, LHS2; 5623 SDValue RHS1, RHS2; 5624 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5625 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5626 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5627 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5628 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5629 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5630 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5631 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5632 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5633 } 5634 5635 return SDValue(); 5636 } 5637 5638 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5639 SDValue Chain = Op.getOperand(0); 5640 SDValue Cond = Op.getOperand(1); 5641 SDValue Dest = Op.getOperand(2); 5642 SDLoc dl(Op); 5643 5644 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5645 // instruction. 5646 unsigned Opc = Cond.getOpcode(); 5647 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5648 !Subtarget->isThumb1Only(); 5649 if (Cond.getResNo() == 1 && 5650 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5651 Opc == ISD::USUBO || OptimizeMul)) { 5652 // Only lower legal XALUO ops. 5653 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5654 return SDValue(); 5655 5656 // The actual operation with overflow check. 5657 SDValue Value, OverflowCmp; 5658 SDValue ARMcc; 5659 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5660 5661 // Reverse the condition code. 
5662 ARMCC::CondCodes CondCode = 5663 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5664 CondCode = ARMCC::getOppositeCondition(CondCode); 5665 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5666 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5667 5668 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5669 OverflowCmp); 5670 } 5671 5672 return SDValue(); 5673 } 5674 5675 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5676 SDValue Chain = Op.getOperand(0); 5677 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5678 SDValue LHS = Op.getOperand(2); 5679 SDValue RHS = Op.getOperand(3); 5680 SDValue Dest = Op.getOperand(4); 5681 SDLoc dl(Op); 5682 5683 if (isUnsupportedFloatingType(LHS.getValueType())) { 5684 DAG.getTargetLoweringInfo().softenSetCCOperands( 5685 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5686 5687 // If softenSetCCOperands only returned one value, we should compare it to 5688 // zero. 5689 if (!RHS.getNode()) { 5690 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5691 CC = ISD::SETNE; 5692 } 5693 } 5694 5695 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5696 // instruction. 5697 unsigned Opc = LHS.getOpcode(); 5698 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5699 !Subtarget->isThumb1Only(); 5700 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5701 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5702 Opc == ISD::USUBO || OptimizeMul) && 5703 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5704 // Only lower legal XALUO ops. 5705 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5706 return SDValue(); 5707 5708 // The actual operation with overflow check. 5709 SDValue Value, OverflowCmp; 5710 SDValue ARMcc; 5711 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5712 5713 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5714 // Reverse the condition code. 
5715 ARMCC::CondCodes CondCode = 5716 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5717 CondCode = ARMCC::getOppositeCondition(CondCode); 5718 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5719 } 5720 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5721 5722 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5723 OverflowCmp); 5724 } 5725 5726 if (LHS.getValueType() == MVT::i32) { 5727 SDValue ARMcc; 5728 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5729 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5730 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5731 Chain, Dest, ARMcc, CCR, Cmp); 5732 } 5733 5734 if (getTargetMachine().Options.UnsafeFPMath && 5735 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5736 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5737 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5738 return Result; 5739 } 5740 5741 ARMCC::CondCodes CondCode, CondCode2; 5742 FPCCToARMCC(CC, CondCode, CondCode2); 5743 5744 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5745 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5746 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5747 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5748 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5749 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5750 if (CondCode2 != ARMCC::AL) { 5751 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5752 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5753 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5754 } 5755 return Res; 5756 } 5757 5758 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5759 SDValue Chain = Op.getOperand(0); 5760 SDValue Table = Op.getOperand(1); 5761 SDValue Index = Op.getOperand(2); 5762 SDLoc dl(Op); 5763 5764 EVT PTy = getPointerTy(DAG.getDataLayout()); 5765 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5766 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5767 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5768 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5769 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5770 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5771 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5772 // which does another jump to the destination. This also makes it easier 5773 // to translate it to TBB / TBH later (Thumb2 only). 5774 // FIXME: This might not work if the function is extremely large. 
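    // At this point Addr = Table + Index * 4, the address of the selected
    // jump-table entry. BR2_JT also carries the raw index and the jump-table
    // id, so the later expansion of the two-level jump (and TBB / TBH
    // formation on Thumb2) has everything it needs.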
5775 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5776 Addr, Op.getOperand(2), JTI); 5777 } 5778 if (isPositionIndependent() || Subtarget->isROPI()) { 5779 Addr = 5780 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5781 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5782 Chain = Addr.getValue(1); 5783 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5784 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5785 } else { 5786 Addr = 5787 DAG.getLoad(PTy, dl, Chain, Addr, 5788 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5789 Chain = Addr.getValue(1); 5790 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5791 } 5792 } 5793 5794 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5795 EVT VT = Op.getValueType(); 5796 SDLoc dl(Op); 5797 5798 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5799 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5800 return Op; 5801 return DAG.UnrollVectorOp(Op.getNode()); 5802 } 5803 5804 const bool HasFullFP16 = 5805 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5806 5807 EVT NewTy; 5808 const EVT OpTy = Op.getOperand(0).getValueType(); 5809 if (OpTy == MVT::v4f32) 5810 NewTy = MVT::v4i32; 5811 else if (OpTy == MVT::v4f16 && HasFullFP16) 5812 NewTy = MVT::v4i16; 5813 else if (OpTy == MVT::v8f16 && HasFullFP16) 5814 NewTy = MVT::v8i16; 5815 else 5816 llvm_unreachable("Invalid type for custom lowering!"); 5817 5818 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5819 return DAG.UnrollVectorOp(Op.getNode()); 5820 5821 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5822 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5823 } 5824 5825 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5826 EVT VT = Op.getValueType(); 5827 if (VT.isVector()) 5828 return LowerVectorFP_TO_INT(Op, DAG); 5829 5830 bool IsStrict = Op->isStrictFPOpcode(); 5831 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5832 5833 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5834 RTLIB::Libcall LC; 5835 if (Op.getOpcode() == ISD::FP_TO_SINT || 5836 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5837 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5838 Op.getValueType()); 5839 else 5840 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5841 Op.getValueType()); 5842 SDLoc Loc(Op); 5843 MakeLibCallOptions CallOptions; 5844 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5845 SDValue Result; 5846 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5847 CallOptions, Loc, Chain); 5848 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 5849 } 5850 5851 // FIXME: Remove this when we have strict fp instruction selection patterns 5852 if (IsStrict) { 5853 SDLoc Loc(Op); 5854 SDValue Result = 5855 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? 
ISD::FP_TO_SINT 5856 : ISD::FP_TO_UINT, 5857 Loc, Op.getValueType(), SrcVal); 5858 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5859 } 5860 5861 return Op; 5862 } 5863 5864 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, 5865 const ARMSubtarget *Subtarget) { 5866 EVT VT = Op.getValueType(); 5867 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 5868 EVT FromVT = Op.getOperand(0).getValueType(); 5869 5870 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32) 5871 return Op; 5872 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 && 5873 Subtarget->hasFP64()) 5874 return Op; 5875 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 && 5876 Subtarget->hasFullFP16()) 5877 return Op; 5878 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 && 5879 Subtarget->hasMVEFloatOps()) 5880 return Op; 5881 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 && 5882 Subtarget->hasMVEFloatOps()) 5883 return Op; 5884 5885 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16) 5886 return SDValue(); 5887 5888 SDLoc DL(Op); 5889 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT; 5890 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned; 5891 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 5892 DAG.getValueType(VT.getScalarType())); 5893 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT, 5894 DAG.getConstant((1 << BW) - 1, DL, VT)); 5895 if (IsSigned) 5896 Max = DAG.getNode(ISD::SMAX, DL, VT, Max, 5897 DAG.getConstant(-(1 << BW), DL, VT)); 5898 return Max; 5899 } 5900 5901 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5902 EVT VT = Op.getValueType(); 5903 SDLoc dl(Op); 5904 5905 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5906 if (VT.getVectorElementType() == MVT::f32) 5907 return Op; 5908 return DAG.UnrollVectorOp(Op.getNode()); 5909 } 5910 5911 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5912 Op.getOperand(0).getValueType() == MVT::v8i16) && 5913 "Invalid type for custom lowering!"); 5914 5915 const bool HasFullFP16 = 5916 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5917 5918 EVT DestVecType; 5919 if (VT == MVT::v4f32) 5920 DestVecType = MVT::v4i32; 5921 else if (VT == MVT::v4f16 && HasFullFP16) 5922 DestVecType = MVT::v4i16; 5923 else if (VT == MVT::v8f16 && HasFullFP16) 5924 DestVecType = MVT::v8i16; 5925 else 5926 return DAG.UnrollVectorOp(Op.getNode()); 5927 5928 unsigned CastOpc; 5929 unsigned Opc; 5930 switch (Op.getOpcode()) { 5931 default: llvm_unreachable("Invalid opcode!"); 5932 case ISD::SINT_TO_FP: 5933 CastOpc = ISD::SIGN_EXTEND; 5934 Opc = ISD::SINT_TO_FP; 5935 break; 5936 case ISD::UINT_TO_FP: 5937 CastOpc = ISD::ZERO_EXTEND; 5938 Opc = ISD::UINT_TO_FP; 5939 break; 5940 } 5941 5942 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5943 return DAG.getNode(Opc, dl, VT, Op); 5944 } 5945 5946 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5947 EVT VT = Op.getValueType(); 5948 if (VT.isVector()) 5949 return LowerVectorINT_TO_FP(Op, DAG); 5950 if (isUnsupportedFloatingType(VT)) { 5951 RTLIB::Libcall LC; 5952 if (Op.getOpcode() == ISD::SINT_TO_FP) 5953 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5954 Op.getValueType()); 5955 else 5956 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5957 Op.getValueType()); 5958 MakeLibCallOptions CallOptions; 5959 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5960 
CallOptions, SDLoc(Op)).first; 5961 } 5962 5963 return Op; 5964 } 5965 5966 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5967 // Implement fcopysign with a fabs and a conditional fneg. 5968 SDValue Tmp0 = Op.getOperand(0); 5969 SDValue Tmp1 = Op.getOperand(1); 5970 SDLoc dl(Op); 5971 EVT VT = Op.getValueType(); 5972 EVT SrcVT = Tmp1.getValueType(); 5973 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5974 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5975 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5976 5977 if (UseNEON) { 5978 // Use VBSL to copy the sign bit. 5979 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5980 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5981 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5982 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 5983 if (VT == MVT::f64) 5984 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5985 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5986 DAG.getConstant(32, dl, MVT::i32)); 5987 else /*if (VT == MVT::f32)*/ 5988 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5989 if (SrcVT == MVT::f32) { 5990 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5991 if (VT == MVT::f64) 5992 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5993 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5994 DAG.getConstant(32, dl, MVT::i32)); 5995 } else if (VT == MVT::f32) 5996 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5997 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5998 DAG.getConstant(32, dl, MVT::i32)); 5999 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 6000 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 6001 6002 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 6003 dl, MVT::i32); 6004 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 6005 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 6006 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 6007 6008 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 6009 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 6010 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 6011 if (VT == MVT::f32) { 6012 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 6013 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 6014 DAG.getConstant(0, dl, MVT::i32)); 6015 } else { 6016 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 6017 } 6018 6019 return Res; 6020 } 6021 6022 // Bitcast operand 1 to i32. 6023 if (SrcVT == MVT::f64) 6024 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 6025 Tmp1).getValue(1); 6026 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 6027 6028 // Or in the signbit with integer operations. 6029 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 6030 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 6031 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 6032 if (VT == MVT::f32) { 6033 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 6034 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 6035 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 6036 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 6037 } 6038 6039 // f64: Or the high part with signbit and then combine two parts. 
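  // For example, copysign(double x, y) on this path becomes: split x into its
  // two 32-bit halves with VMOVRRD, clear the sign bit of the high half
  // (AND 0x7fffffff), OR in the sign bit already extracted from y above, and
  // rebuild the double with VMOVDRR.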
6040 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 6041 Tmp0); 6042 SDValue Lo = Tmp0.getValue(0); 6043 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 6044 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 6045 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 6046 } 6047 6048 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 6049 MachineFunction &MF = DAG.getMachineFunction(); 6050 MachineFrameInfo &MFI = MF.getFrameInfo(); 6051 MFI.setReturnAddressIsTaken(true); 6052 6053 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 6054 return SDValue(); 6055 6056 EVT VT = Op.getValueType(); 6057 SDLoc dl(Op); 6058 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6059 if (Depth) { 6060 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6061 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 6062 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 6063 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 6064 MachinePointerInfo()); 6065 } 6066 6067 // Return LR, which contains the return address. Mark it an implicit live-in. 6068 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 6069 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 6070 } 6071 6072 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 6073 const ARMBaseRegisterInfo &ARI = 6074 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 6075 MachineFunction &MF = DAG.getMachineFunction(); 6076 MachineFrameInfo &MFI = MF.getFrameInfo(); 6077 MFI.setFrameAddressIsTaken(true); 6078 6079 EVT VT = Op.getValueType(); 6080 SDLoc dl(Op); // FIXME probably not meaningful 6081 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6082 Register FrameReg = ARI.getFrameRegister(MF); 6083 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6084 while (Depth--) 6085 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 6086 MachinePointerInfo()); 6087 return FrameAddr; 6088 } 6089 6090 // FIXME? Maybe this could be a TableGen attribute on some registers and 6091 // this table could be generated automatically from RegInfo. 6092 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 6093 const MachineFunction &MF) const { 6094 Register Reg = StringSwitch<unsigned>(RegName) 6095 .Case("sp", ARM::SP) 6096 .Default(0); 6097 if (Reg) 6098 return Reg; 6099 report_fatal_error(Twine("Invalid register name \"" 6100 + StringRef(RegName) + "\".")); 6101 } 6102 6103 // Result is 64 bit value so split into two 32 bit values and return as a 6104 // pair of values. 6105 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 6106 SelectionDAG &DAG) { 6107 SDLoc DL(N); 6108 6109 // This function is only supposed to be called for i64 type destination. 6110 assert(N->getValueType(0) == MVT::i64 6111 && "ExpandREAD_REGISTER called for non-i64 type result."); 6112 6113 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 6114 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 6115 N->getOperand(0), 6116 N->getOperand(1)); 6117 6118 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 6119 Read.getValue(1))); 6120 Results.push_back(Read.getOperand(0)); 6121 } 6122 6123 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point in forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector (vNxMTy bitcast vNi64 src), i32 index*M
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}

/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i16 and i64 types, either
  // as the source or destination of the bit convert.
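  // The cases handled below are:
  //  * i16/i32 -> f16/bf16 and f16/bf16 -> i16/i32, which go through the
  //    half-precision register moves (MoveToHPR / MoveFromHPR), and
  //  * i64 -> legal 64-bit type and legal 64-bit type -> i64, which become
  //    VMOVDRR and VMOVRRD respectively.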
6186 EVT SrcVT = Op.getValueType(); 6187 EVT DstVT = N->getValueType(0); 6188 6189 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && 6190 (DstVT == MVT::f16 || DstVT == MVT::bf16)) 6191 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), 6192 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); 6193 6194 if ((DstVT == MVT::i16 || DstVT == MVT::i32) && 6195 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) 6196 return DAG.getNode( 6197 ISD::TRUNCATE, SDLoc(N), DstVT, 6198 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); 6199 6200 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 6201 return SDValue(); 6202 6203 // Turn i64->f64 into VMOVDRR. 6204 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 6205 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 6206 // if we can combine the bitcast with its source. 6207 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 6208 return Val; 6209 6210 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6211 DAG.getConstant(0, dl, MVT::i32)); 6212 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6213 DAG.getConstant(1, dl, MVT::i32)); 6214 return DAG.getNode(ISD::BITCAST, dl, DstVT, 6215 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 6216 } 6217 6218 // Turn f64->i64 into VMOVRRD. 6219 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 6220 SDValue Cvt; 6221 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 6222 SrcVT.getVectorNumElements() > 1) 6223 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6224 DAG.getVTList(MVT::i32, MVT::i32), 6225 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 6226 else 6227 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6228 DAG.getVTList(MVT::i32, MVT::i32), Op); 6229 // Merge the pieces into a single i64 value. 6230 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 6231 } 6232 6233 return SDValue(); 6234 } 6235 6236 /// getZeroVector - Returns a vector of specified type with all zero elements. 6237 /// Zero vectors are used to represent vector negation and in those cases 6238 /// will be implemented with the NEON VNEG instruction. However, VNEG does 6239 /// not support i64 elements, so sometimes the zero vectors will need to be 6240 /// explicitly constructed. Regardless, use a canonical VMOV to create the 6241 /// zero vector. 6242 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 6243 assert(VT.isVector() && "Expected a vector type"); 6244 // The canonical modified immediate encoding of a zero vector is....0! 6245 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 6246 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 6247 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 6248 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6249 } 6250 6251 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 6252 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6253 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 6254 SelectionDAG &DAG) const { 6255 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6256 EVT VT = Op.getValueType(); 6257 unsigned VTBits = VT.getSizeInBits(); 6258 SDLoc dl(Op); 6259 SDValue ShOpLo = Op.getOperand(0); 6260 SDValue ShOpHi = Op.getOperand(1); 6261 SDValue ShAmt = Op.getOperand(2); 6262 SDValue ARMcc; 6263 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6264 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 6265 6266 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 6267 6268 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6269 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6270 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 6271 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6272 DAG.getConstant(VTBits, dl, MVT::i32)); 6273 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 6274 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6275 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 6276 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6277 ISD::SETGE, ARMcc, DAG, dl); 6278 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 6279 ARMcc, CCR, CmpLo); 6280 6281 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 6282 SDValue HiBigShift = Opc == ISD::SRA 6283 ? DAG.getNode(Opc, dl, VT, ShOpHi, 6284 DAG.getConstant(VTBits - 1, dl, VT)) 6285 : DAG.getConstant(0, dl, VT); 6286 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6287 ISD::SETGE, ARMcc, DAG, dl); 6288 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6289 ARMcc, CCR, CmpHi); 6290 6291 SDValue Ops[2] = { Lo, Hi }; 6292 return DAG.getMergeValues(Ops, dl); 6293 } 6294 6295 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 6296 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6297 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 6298 SelectionDAG &DAG) const { 6299 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6300 EVT VT = Op.getValueType(); 6301 unsigned VTBits = VT.getSizeInBits(); 6302 SDLoc dl(Op); 6303 SDValue ShOpLo = Op.getOperand(0); 6304 SDValue ShOpHi = Op.getOperand(1); 6305 SDValue ShAmt = Op.getOperand(2); 6306 SDValue ARMcc; 6307 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6308 6309 assert(Op.getOpcode() == ISD::SHL_PARTS); 6310 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6311 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6312 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 6313 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 6314 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6315 6316 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6317 DAG.getConstant(VTBits, dl, MVT::i32)); 6318 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 6319 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6320 ISD::SETGE, ARMcc, DAG, dl); 6321 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6322 ARMcc, CCR, CmpHi); 6323 6324 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6325 ISD::SETGE, ARMcc, DAG, dl); 6326 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6327 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 6328 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 6329 6330 SDValue Ops[2] = { Lo, Hi }; 6331 return DAG.getMergeValues(Ops, dl); 6332 } 6333 6334 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 6335 SelectionDAG &DAG) const { 6336 // The rounding mode is in bits 23:22 of the FPSCR. 6337 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 6338 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 6339 // so that the shift + and get folded into a bitfield extract. 
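  // For example, if FPSCR[23:22] == 0b10 (round toward minus infinity), the
  // sequence below computes ((2 + 1) & 3) == 3, which is the FLT_ROUNDS
  // encoding of round-toward-negative.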
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};

  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({And, Chain}, dl);
}

SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue RMValue = Op->getOperand(1);

  // The rounding mode is in bits 23:22 of the FPSCR.
  // The llvm.set.rounding argument value to ARM rounding mode value mapping
  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
  //
  // It is expected that the argument of llvm.set.rounding is within the
  // range [0, 3], so NearestTiesToAway (4) is not handled here. It is the
  // responsibility of the code that generates llvm.set.rounding to ensure
  // this condition.

  // Calculate new value of FPSCR[23:22].
  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
                        DAG.getConstant(1, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
                        DAG.getConstant(0x3, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
                        DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));

  // Get current value of FPSCR.
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  // Put new rounding mode into FPSCR[23:22].
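  // At this point RMValue holds the new mode already shifted into bits 23:22;
  // e.g. llvm.set.rounding(2) (round toward +infinity) ends up as
  // ((2 - 1) & 3) == 1 in that field. The code below clears the old
  // FPSCR[23:22] bits, ORs in the new value, and writes the result back with
  // the arm_set_fpscr intrinsic.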
6390 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); 6391 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, 6392 DAG.getConstant(RMMask, DL, MVT::i32)); 6393 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); 6394 SDValue Ops2[] = { 6395 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; 6396 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 6397 } 6398 6399 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 6400 const ARMSubtarget *ST) { 6401 SDLoc dl(N); 6402 EVT VT = N->getValueType(0); 6403 if (VT.isVector() && ST->hasNEON()) { 6404 6405 // Compute the least significant set bit: LSB = X & -X 6406 SDValue X = N->getOperand(0); 6407 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 6408 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 6409 6410 EVT ElemTy = VT.getVectorElementType(); 6411 6412 if (ElemTy == MVT::i8) { 6413 // Compute with: cttz(x) = ctpop(lsb - 1) 6414 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6415 DAG.getTargetConstant(1, dl, ElemTy)); 6416 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6417 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6418 } 6419 6420 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 6421 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 6422 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 6423 unsigned NumBits = ElemTy.getSizeInBits(); 6424 SDValue WidthMinus1 = 6425 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6426 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 6427 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 6428 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 6429 } 6430 6431 // Compute with: cttz(x) = ctpop(lsb - 1) 6432 6433 // Compute LSB - 1. 6434 SDValue Bits; 6435 if (ElemTy == MVT::i64) { 6436 // Load constant 0xffff'ffff'ffff'ffff to register. 6437 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6438 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 6439 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 6440 } else { 6441 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6442 DAG.getTargetConstant(1, dl, ElemTy)); 6443 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6444 } 6445 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6446 } 6447 6448 if (!ST->hasV6T2Ops()) 6449 return SDValue(); 6450 6451 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 6452 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6453 } 6454 6455 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6456 const ARMSubtarget *ST) { 6457 EVT VT = N->getValueType(0); 6458 SDLoc DL(N); 6459 6460 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6461 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6462 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6463 "Unexpected type for custom ctpop lowering"); 6464 6465 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6466 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6467 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6468 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6469 6470 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6471 unsigned EltSize = 8; 6472 unsigned NumElts = VT.is64BitVector() ? 
                                         8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the shift count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}

static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here: shift by an immediate and shift by a
  // vector register (there are also shifts by a GPR, but those are just
  // handled with a tablegen pattern). We cannot easily match shift by an
  // immediate in tablegen so we do that here and generate a
  // VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
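  // For example, a v4i32 SRL by the constant splat <4, 4, 4, 4> becomes
  // VSHRuIMM with Cnt == 4, whereas an SRL by a non-constant vector amount is
  // emitted as VSHLu(X, 0 - Amt), because only the (signed/unsigned) left
  // shift accepts a register shift amount.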
6556 if (N->getOpcode() == ISD::SHL) { 6557 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6558 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6559 DAG.getConstant(Cnt, dl, MVT::i32)); 6560 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6561 N->getOperand(1)); 6562 } 6563 6564 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6565 "unexpected vector shift opcode"); 6566 6567 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6568 unsigned VShiftOpc = 6569 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6570 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6571 DAG.getConstant(Cnt, dl, MVT::i32)); 6572 } 6573 6574 // Other right shifts we don't have operations for (we use a shift left by a 6575 // negative number). 6576 EVT ShiftVT = N->getOperand(1).getValueType(); 6577 SDValue NegatedCount = DAG.getNode( 6578 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6579 unsigned VShiftOpc = 6580 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); 6581 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6582 } 6583 6584 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6585 const ARMSubtarget *ST) { 6586 EVT VT = N->getValueType(0); 6587 SDLoc dl(N); 6588 6589 // We can get here for a node like i32 = ISD::SHL i32, i64 6590 if (VT != MVT::i64) 6591 return SDValue(); 6592 6593 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6594 N->getOpcode() == ISD::SHL) && 6595 "Unknown shift to lower!"); 6596 6597 unsigned ShOpc = N->getOpcode(); 6598 if (ST->hasMVEIntegerOps()) { 6599 SDValue ShAmt = N->getOperand(1); 6600 unsigned ShPartsOpc = ARMISD::LSLL; 6601 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6602 6603 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6604 // then do the default optimisation 6605 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6606 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6607 return SDValue(); 6608 6609 // Extract the lower 32 bits of the shift amount if it's not an i32 6610 if (ShAmt->getValueType(0) != MVT::i32) 6611 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6612 6613 if (ShOpc == ISD::SRL) { 6614 if (!Con) 6615 // There is no t2LSRLr instruction so negate and perform an lsll if the 6616 // shift amount is in a register, emulating a right shift. 6617 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6618 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6619 else 6620 // Else generate an lsrl on the immediate shift amount 6621 ShPartsOpc = ARMISD::LSRL; 6622 } else if (ShOpc == ISD::SRA) 6623 ShPartsOpc = ARMISD::ASRL; 6624 6625 // Lower 32 bits of the destination/source 6626 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6627 DAG.getConstant(0, dl, MVT::i32)); 6628 // Upper 32 bits of the destination/source 6629 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6630 DAG.getConstant(1, dl, MVT::i32)); 6631 6632 // Generate the shift operation as computed above 6633 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6634 ShAmt); 6635 // The upper 32 bits come from the second return value of lsll 6636 Hi = SDValue(Lo.getNode(), 1); 6637 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6638 } 6639 6640 // We only lower SRA, SRL of 1 here, all others use generic lowering. 
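  // The shift-by-one case maps directly onto the carry flag: shift the high
  // word right by one, capturing the bit that falls off in the carry
  // (SRL_FLAG / SRA_FLAG), then rotate that carry into the top of the low
  // word with RRX.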
6641 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6642 return SDValue(); 6643 6644 // If we are in thumb mode, we don't have RRX. 6645 if (ST->isThumb1Only()) 6646 return SDValue(); 6647 6648 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6649 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6650 DAG.getConstant(0, dl, MVT::i32)); 6651 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6652 DAG.getConstant(1, dl, MVT::i32)); 6653 6654 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6655 // captures the result into a carry flag. 6656 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6657 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6658 6659 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6660 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6661 6662 // Merge the pieces into a single i64 value. 6663 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6664 } 6665 6666 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6667 const ARMSubtarget *ST) { 6668 bool Invert = false; 6669 bool Swap = false; 6670 unsigned Opc = ARMCC::AL; 6671 6672 SDValue Op0 = Op.getOperand(0); 6673 SDValue Op1 = Op.getOperand(1); 6674 SDValue CC = Op.getOperand(2); 6675 EVT VT = Op.getValueType(); 6676 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6677 SDLoc dl(Op); 6678 6679 EVT CmpVT; 6680 if (ST->hasNEON()) 6681 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6682 else { 6683 assert(ST->hasMVEIntegerOps() && 6684 "No hardware support for integer vector comparison!"); 6685 6686 if (Op.getValueType().getVectorElementType() != MVT::i1) 6687 return SDValue(); 6688 6689 // Make sure we expand floating point setcc to scalar if we do not have 6690 // mve.fp, so that we can handle them from there. 6691 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6692 return SDValue(); 6693 6694 CmpVT = VT; 6695 } 6696 6697 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6698 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6699 // Special-case integer 64-bit equality comparisons. They aren't legal, 6700 // but they can be lowered with a few vector instructions. 6701 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6702 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6703 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6704 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6705 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6706 DAG.getCondCode(ISD::SETEQ)); 6707 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6708 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6709 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6710 if (SetCCOpcode == ISD::SETNE) 6711 Merged = DAG.getNOT(dl, Merged, CmpVT); 6712 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6713 return Merged; 6714 } 6715 6716 if (CmpVT.getVectorElementType() == MVT::i64) 6717 // 64-bit comparisons are not legal in general. 
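    // (other than the EQ/NE special case handled above; returning SDValue()
    //  here lets the generic legalizer expand the comparison)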
6718 return SDValue(); 6719 6720 if (Op1.getValueType().isFloatingPoint()) { 6721 switch (SetCCOpcode) { 6722 default: llvm_unreachable("Illegal FP comparison"); 6723 case ISD::SETUNE: 6724 case ISD::SETNE: 6725 if (ST->hasMVEFloatOps()) { 6726 Opc = ARMCC::NE; break; 6727 } else { 6728 Invert = true; LLVM_FALLTHROUGH; 6729 } 6730 case ISD::SETOEQ: 6731 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6732 case ISD::SETOLT: 6733 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6734 case ISD::SETOGT: 6735 case ISD::SETGT: Opc = ARMCC::GT; break; 6736 case ISD::SETOLE: 6737 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6738 case ISD::SETOGE: 6739 case ISD::SETGE: Opc = ARMCC::GE; break; 6740 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6741 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6742 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6743 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6744 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6745 case ISD::SETONE: { 6746 // Expand this to (OLT | OGT). 6747 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6748 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6749 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6750 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6751 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6752 if (Invert) 6753 Result = DAG.getNOT(dl, Result, VT); 6754 return Result; 6755 } 6756 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6757 case ISD::SETO: { 6758 // Expand this to (OLT | OGE). 6759 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6760 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6761 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6762 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6763 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6764 if (Invert) 6765 Result = DAG.getNOT(dl, Result, VT); 6766 return Result; 6767 } 6768 } 6769 } else { 6770 // Integer comparisons. 6771 switch (SetCCOpcode) { 6772 default: llvm_unreachable("Illegal integer comparison"); 6773 case ISD::SETNE: 6774 if (ST->hasMVEIntegerOps()) { 6775 Opc = ARMCC::NE; break; 6776 } else { 6777 Invert = true; LLVM_FALLTHROUGH; 6778 } 6779 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6780 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6781 case ISD::SETGT: Opc = ARMCC::GT; break; 6782 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6783 case ISD::SETGE: Opc = ARMCC::GE; break; 6784 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6785 case ISD::SETUGT: Opc = ARMCC::HI; break; 6786 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6787 case ISD::SETUGE: Opc = ARMCC::HS; break; 6788 } 6789 6790 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6791 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6792 SDValue AndOp; 6793 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6794 AndOp = Op0; 6795 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6796 AndOp = Op1; 6797 6798 // Ignore bitconvert. 
6799 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6800 AndOp = AndOp.getOperand(0); 6801 6802 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6803 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6804 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6805 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6806 if (!Invert) 6807 Result = DAG.getNOT(dl, Result, VT); 6808 return Result; 6809 } 6810 } 6811 } 6812 6813 if (Swap) 6814 std::swap(Op0, Op1); 6815 6816 // If one of the operands is a constant vector zero, attempt to fold the 6817 // comparison to a specialized compare-against-zero form. 6818 SDValue SingleOp; 6819 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6820 SingleOp = Op0; 6821 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6822 if (Opc == ARMCC::GE) 6823 Opc = ARMCC::LE; 6824 else if (Opc == ARMCC::GT) 6825 Opc = ARMCC::LT; 6826 SingleOp = Op1; 6827 } 6828 6829 SDValue Result; 6830 if (SingleOp.getNode()) { 6831 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6832 DAG.getConstant(Opc, dl, MVT::i32)); 6833 } else { 6834 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6835 DAG.getConstant(Opc, dl, MVT::i32)); 6836 } 6837 6838 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6839 6840 if (Invert) 6841 Result = DAG.getNOT(dl, Result, VT); 6842 6843 return Result; 6844 } 6845 6846 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6847 SDValue LHS = Op.getOperand(0); 6848 SDValue RHS = Op.getOperand(1); 6849 SDValue Carry = Op.getOperand(2); 6850 SDValue Cond = Op.getOperand(3); 6851 SDLoc DL(Op); 6852 6853 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6854 6855 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6856 // have to invert the carry first. 6857 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6858 DAG.getConstant(1, DL, MVT::i32), Carry); 6859 // This converts the boolean value carry into the carry flag. 6860 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6861 6862 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6863 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6864 6865 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6866 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6867 SDValue ARMcc = DAG.getConstant( 6868 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6869 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6870 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6871 Cmp.getValue(1), SDValue()); 6872 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6873 CCR, Chain.getValue(1)); 6874 } 6875 6876 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6877 /// valid vector constant for a NEON or MVE instruction with a "modified 6878 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6879 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6880 unsigned SplatBitSize, SelectionDAG &DAG, 6881 const SDLoc &dl, EVT &VT, EVT VectorVT, 6882 VMOVModImmType type) { 6883 unsigned OpCmode, Imm; 6884 bool is128Bits = VectorVT.is128BitVector(); 6885 6886 // SplatBitSize is set to the smallest size that splats the vector, so a 6887 // zero vector will always have SplatBitSize == 8. 
However, NEON modified 6888 // immediate instructions others than VMOV do not support the 8-bit encoding 6889 // of a zero vector, and the default encoding of zero is supposed to be the 6890 // 32-bit version. 6891 if (SplatBits == 0) 6892 SplatBitSize = 32; 6893 6894 switch (SplatBitSize) { 6895 case 8: 6896 if (type != VMOVModImm) 6897 return SDValue(); 6898 // Any 1-byte value is OK. Op=0, Cmode=1110. 6899 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6900 OpCmode = 0xe; 6901 Imm = SplatBits; 6902 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6903 break; 6904 6905 case 16: 6906 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6907 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6908 if ((SplatBits & ~0xff) == 0) { 6909 // Value = 0x00nn: Op=x, Cmode=100x. 6910 OpCmode = 0x8; 6911 Imm = SplatBits; 6912 break; 6913 } 6914 if ((SplatBits & ~0xff00) == 0) { 6915 // Value = 0xnn00: Op=x, Cmode=101x. 6916 OpCmode = 0xa; 6917 Imm = SplatBits >> 8; 6918 break; 6919 } 6920 return SDValue(); 6921 6922 case 32: 6923 // NEON's 32-bit VMOV supports splat values where: 6924 // * only one byte is nonzero, or 6925 // * the least significant byte is 0xff and the second byte is nonzero, or 6926 // * the least significant 2 bytes are 0xff and the third is nonzero. 6927 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6928 if ((SplatBits & ~0xff) == 0) { 6929 // Value = 0x000000nn: Op=x, Cmode=000x. 6930 OpCmode = 0; 6931 Imm = SplatBits; 6932 break; 6933 } 6934 if ((SplatBits & ~0xff00) == 0) { 6935 // Value = 0x0000nn00: Op=x, Cmode=001x. 6936 OpCmode = 0x2; 6937 Imm = SplatBits >> 8; 6938 break; 6939 } 6940 if ((SplatBits & ~0xff0000) == 0) { 6941 // Value = 0x00nn0000: Op=x, Cmode=010x. 6942 OpCmode = 0x4; 6943 Imm = SplatBits >> 16; 6944 break; 6945 } 6946 if ((SplatBits & ~0xff000000) == 0) { 6947 // Value = 0xnn000000: Op=x, Cmode=011x. 6948 OpCmode = 0x6; 6949 Imm = SplatBits >> 24; 6950 break; 6951 } 6952 6953 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6954 if (type == OtherModImm) return SDValue(); 6955 6956 if ((SplatBits & ~0xffff) == 0 && 6957 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6958 // Value = 0x0000nnff: Op=x, Cmode=1100. 6959 OpCmode = 0xc; 6960 Imm = SplatBits >> 8; 6961 break; 6962 } 6963 6964 // cmode == 0b1101 is not supported for MVE VMVN 6965 if (type == MVEVMVNModImm) 6966 return SDValue(); 6967 6968 if ((SplatBits & ~0xffffff) == 0 && 6969 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6970 // Value = 0x00nnffff: Op=x, Cmode=1101. 6971 OpCmode = 0xd; 6972 Imm = SplatBits >> 16; 6973 break; 6974 } 6975 6976 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6977 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6978 // VMOV.I32. A (very) minor optimization would be to replicate the value 6979 // and fall through here to test for a valid 64-bit splat. But, then the 6980 // caller would also need to check and handle the change in size. 6981 return SDValue(); 6982 6983 case 64: { 6984 if (type != VMOVModImm) 6985 return SDValue(); 6986 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 
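    // For example, the splat value 0x00ff0000ffff00ff is encodable: the loop
    // below sets bit i of Imm whenever byte i of the value (counting from the
    // least significant byte) is all-ones, giving Imm == 0b01001101.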
6987 uint64_t BitMask = 0xff; 6988 unsigned ImmMask = 1; 6989 Imm = 0; 6990 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6991 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6992 Imm |= ImmMask; 6993 } else if ((SplatBits & BitMask) != 0) { 6994 return SDValue(); 6995 } 6996 BitMask <<= 8; 6997 ImmMask <<= 1; 6998 } 6999 7000 if (DAG.getDataLayout().isBigEndian()) { 7001 // Reverse the order of elements within the vector. 7002 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; 7003 unsigned Mask = (1 << BytesPerElem) - 1; 7004 unsigned NumElems = 8 / BytesPerElem; 7005 unsigned NewImm = 0; 7006 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { 7007 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); 7008 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; 7009 } 7010 Imm = NewImm; 7011 } 7012 7013 // Op=1, Cmode=1110. 7014 OpCmode = 0x1e; 7015 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 7016 break; 7017 } 7018 7019 default: 7020 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 7021 } 7022 7023 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 7024 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 7025 } 7026 7027 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 7028 const ARMSubtarget *ST) const { 7029 EVT VT = Op.getValueType(); 7030 bool IsDouble = (VT == MVT::f64); 7031 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 7032 const APFloat &FPVal = CFP->getValueAPF(); 7033 7034 // Prevent floating-point constants from using literal loads 7035 // when execute-only is enabled. 7036 if (ST->genExecuteOnly()) { 7037 // If we can represent the constant as an immediate, don't lower it 7038 if (isFPImmLegal(FPVal, VT)) 7039 return Op; 7040 // Otherwise, construct as integer, and move to float register 7041 APInt INTVal = FPVal.bitcastToAPInt(); 7042 SDLoc DL(CFP); 7043 switch (VT.getSimpleVT().SimpleTy) { 7044 default: 7045 llvm_unreachable("Unknown floating point type!"); 7046 break; 7047 case MVT::f64: { 7048 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 7049 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 7050 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 7051 } 7052 case MVT::f32: 7053 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 7054 DAG.getConstant(INTVal, DL, MVT::i32)); 7055 } 7056 } 7057 7058 if (!ST->hasVFP3Base()) 7059 return SDValue(); 7060 7061 // Use the default (constant pool) lowering for double constants when we have 7062 // an SP-only FPU 7063 if (IsDouble && !Subtarget->hasFP64()) 7064 return SDValue(); 7065 7066 // Try splatting with a VMOV.f32... 7067 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 7068 7069 if (ImmVal != -1) { 7070 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 7071 // We have code in place to select a valid ConstantFP already, no need to 7072 // do any mangling. 7073 return Op; 7074 } 7075 7076 // It's a float and we are trying to use NEON operations where 7077 // possible. Lower it to a splat followed by an extract. 7078 SDLoc DL(Op); 7079 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 7080 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 7081 NewVal); 7082 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 7083 DAG.getConstant(0, DL, MVT::i32)); 7084 } 7085 7086 // The rest of our options are NEON only, make sure that's allowed before 7087 // proceeding.. 
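  // From here on we try to materialize the constant with a NEON VMOV.i32 of
  // its 32-bit bit pattern and, failing that, a VMVN.i32 of the complement.
  // An f32 result is extracted from lane 0 of the splat; an f64 result is a
  // bitcast of the whole 64-bit register, which only works when both 32-bit
  // halves of the double are identical (e.g. 0.0), as checked below.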
7088 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 7089 return SDValue(); 7090 7091 EVT VMovVT; 7092 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 7093 7094 // It wouldn't really be worth bothering for doubles except for one very 7095 // important value, which does happen to match: 0.0. So make sure we don't do 7096 // anything stupid. 7097 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 7098 return SDValue(); 7099 7100 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 7101 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 7102 VMovVT, VT, VMOVModImm); 7103 if (NewVal != SDValue()) { 7104 SDLoc DL(Op); 7105 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 7106 NewVal); 7107 if (IsDouble) 7108 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7109 7110 // It's a float: cast and extract a vector element. 7111 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7112 VecConstant); 7113 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7114 DAG.getConstant(0, DL, MVT::i32)); 7115 } 7116 7117 // Finally, try a VMVN.i32 7118 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 7119 VT, VMVNModImm); 7120 if (NewVal != SDValue()) { 7121 SDLoc DL(Op); 7122 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 7123 7124 if (IsDouble) 7125 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7126 7127 // It's a float: cast and extract a vector element. 7128 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7129 VecConstant); 7130 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7131 DAG.getConstant(0, DL, MVT::i32)); 7132 } 7133 7134 return SDValue(); 7135 } 7136 7137 // check if an VEXT instruction can handle the shuffle mask when the 7138 // vector sources of the shuffle are the same. 7139 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 7140 unsigned NumElts = VT.getVectorNumElements(); 7141 7142 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7143 if (M[0] < 0) 7144 return false; 7145 7146 Imm = M[0]; 7147 7148 // If this is a VEXT shuffle, the immediate value is the index of the first 7149 // element. The other shuffle indices must be the successive elements after 7150 // the first one. 7151 unsigned ExpectedElt = Imm; 7152 for (unsigned i = 1; i < NumElts; ++i) { 7153 // Increment the expected index. If it wraps around, just follow it 7154 // back to index zero and keep going. 7155 ++ExpectedElt; 7156 if (ExpectedElt == NumElts) 7157 ExpectedElt = 0; 7158 7159 if (M[i] < 0) continue; // ignore UNDEF indices 7160 if (ExpectedElt != static_cast<unsigned>(M[i])) 7161 return false; 7162 } 7163 7164 return true; 7165 } 7166 7167 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 7168 bool &ReverseVEXT, unsigned &Imm) { 7169 unsigned NumElts = VT.getVectorNumElements(); 7170 ReverseVEXT = false; 7171 7172 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7173 if (M[0] < 0) 7174 return false; 7175 7176 Imm = M[0]; 7177 7178 // If this is a VEXT shuffle, the immediate value is the index of the first 7179 // element. The other shuffle indices must be the successive elements after 7180 // the first one. 7181 unsigned ExpectedElt = Imm; 7182 for (unsigned i = 1; i < NumElts; ++i) { 7183 // Increment the expected index. If it wraps around, it may still be 7184 // a VEXT but the source vectors must be swapped. 
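    // For example, with v4i16 the mask <3, 4, 5, 6> is a VEXT with Imm = 3
    // and the operands in their original order, while <7, 0, 1, 2> wraps
    // past the end of the concatenated pair, so the operands are swapped and
    // Imm becomes 3 after the adjustment at the end of the function.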
7185 ExpectedElt += 1; 7186 if (ExpectedElt == NumElts * 2) { 7187 ExpectedElt = 0; 7188 ReverseVEXT = true; 7189 } 7190 7191 if (M[i] < 0) continue; // ignore UNDEF indices 7192 if (ExpectedElt != static_cast<unsigned>(M[i])) 7193 return false; 7194 } 7195 7196 // Adjust the index value if the source operands will be swapped. 7197 if (ReverseVEXT) 7198 Imm -= NumElts; 7199 7200 return true; 7201 } 7202 7203 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 7204 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 7205 // range, then 0 is placed into the resulting vector. So pretty much any mask 7206 // of 8 elements can work here. 7207 return VT == MVT::v8i8 && M.size() == 8; 7208 } 7209 7210 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 7211 unsigned Index) { 7212 if (Mask.size() == Elements * 2) 7213 return Index / Elements; 7214 return Mask[Index] == 0 ? 0 : 1; 7215 } 7216 7217 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 7218 // checking that pairs of elements in the shuffle mask represent the same index 7219 // in each vector, incrementing the expected index by 2 at each step. 7220 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 7221 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 7222 // v2={e,f,g,h} 7223 // WhichResult gives the offset for each element in the mask based on which 7224 // of the two results it belongs to. 7225 // 7226 // The transpose can be represented either as: 7227 // result1 = shufflevector v1, v2, result1_shuffle_mask 7228 // result2 = shufflevector v1, v2, result2_shuffle_mask 7229 // where v1/v2 and the shuffle masks have the same number of elements 7230 // (here WhichResult (see below) indicates which result is being checked) 7231 // 7232 // or as: 7233 // results = shufflevector v1, v2, shuffle_mask 7234 // where both results are returned in one vector and the shuffle mask has twice 7235 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 7236 // want to check the low half and high half of the shuffle mask as if it were 7237 // the other case 7238 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7239 unsigned EltSz = VT.getScalarSizeInBits(); 7240 if (EltSz == 64) 7241 return false; 7242 7243 unsigned NumElts = VT.getVectorNumElements(); 7244 if (M.size() != NumElts && M.size() != NumElts*2) 7245 return false; 7246 7247 // If the mask is twice as long as the input vector then we need to check the 7248 // upper and lower parts of the mask with a matching value for WhichResult 7249 // FIXME: A mask with only even values will be rejected in case the first 7250 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 7251 // M[0] is used to determine WhichResult 7252 for (unsigned i = 0; i < M.size(); i += NumElts) { 7253 WhichResult = SelectPairHalf(NumElts, M, i); 7254 for (unsigned j = 0; j < NumElts; j += 2) { 7255 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7256 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 7257 return false; 7258 } 7259 } 7260 7261 if (M.size() == NumElts*2) 7262 WhichResult = 0; 7263 7264 return true; 7265 } 7266 7267 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 7268 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7269 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
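/// (Shuffling <a,b,c,d> with itself under mask <0, 0, 2, 2> yields <a,a,c,c>, the first VTRN result when both sources are the same vector.)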
7270 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7271 unsigned EltSz = VT.getScalarSizeInBits(); 7272 if (EltSz == 64) 7273 return false; 7274 7275 unsigned NumElts = VT.getVectorNumElements(); 7276 if (M.size() != NumElts && M.size() != NumElts*2) 7277 return false; 7278 7279 for (unsigned i = 0; i < M.size(); i += NumElts) { 7280 WhichResult = SelectPairHalf(NumElts, M, i); 7281 for (unsigned j = 0; j < NumElts; j += 2) { 7282 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7283 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 7284 return false; 7285 } 7286 } 7287 7288 if (M.size() == NumElts*2) 7289 WhichResult = 0; 7290 7291 return true; 7292 } 7293 7294 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 7295 // that the mask elements are either all even and in steps of size 2 or all odd 7296 // and in steps of size 2. 7297 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 7298 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 7299 // v2={e,f,g,h} 7300 // Requires similar checks to those of isVTRNMask with 7301 // respect to how the results are returned. 7302 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7303 unsigned EltSz = VT.getScalarSizeInBits(); 7304 if (EltSz == 64) 7305 return false; 7306 7307 unsigned NumElts = VT.getVectorNumElements(); 7308 if (M.size() != NumElts && M.size() != NumElts*2) 7309 return false; 7310 7311 for (unsigned i = 0; i < M.size(); i += NumElts) { 7312 WhichResult = SelectPairHalf(NumElts, M, i); 7313 for (unsigned j = 0; j < NumElts; ++j) { 7314 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 7315 return false; 7316 } 7317 } 7318 7319 if (M.size() == NumElts*2) 7320 WhichResult = 0; 7321 7322 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7323 if (VT.is64BitVector() && EltSz == 32) 7324 return false; 7325 7326 return true; 7327 } 7328 7329 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 7330 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7331 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>. 7332 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7333 unsigned EltSz = VT.getScalarSizeInBits(); 7334 if (EltSz == 64) 7335 return false; 7336 7337 unsigned NumElts = VT.getVectorNumElements(); 7338 if (M.size() != NumElts && M.size() != NumElts*2) 7339 return false; 7340 7341 unsigned Half = NumElts / 2; 7342 for (unsigned i = 0; i < M.size(); i += NumElts) { 7343 WhichResult = SelectPairHalf(NumElts, M, i); 7344 for (unsigned j = 0; j < NumElts; j += Half) { 7345 unsigned Idx = WhichResult; 7346 for (unsigned k = 0; k < Half; ++k) { 7347 int MIdx = M[i + j + k]; 7348 if (MIdx >= 0 && (unsigned) MIdx != Idx) 7349 return false; 7350 Idx += 2; 7351 } 7352 } 7353 } 7354 7355 if (M.size() == NumElts*2) 7356 WhichResult = 0; 7357 7358 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7359 if (VT.is64BitVector() && EltSz == 32) 7360 return false; 7361 7362 return true; 7363 } 7364 7365 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 7366 // that pairs of elements of the shufflemask represent the same index in each 7367 // vector incrementing sequentially through the vectors. 7368 // e.g.
For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 7369 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 7370 // v2={e,f,g,h} 7371 // Requires similar checks to those of isVTRNMask with respect to how the 7372 // results are returned. 7373 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7374 unsigned EltSz = VT.getScalarSizeInBits(); 7375 if (EltSz == 64) 7376 return false; 7377 7378 unsigned NumElts = VT.getVectorNumElements(); 7379 if (M.size() != NumElts && M.size() != NumElts*2) 7380 return false; 7381 7382 for (unsigned i = 0; i < M.size(); i += NumElts) { 7383 WhichResult = SelectPairHalf(NumElts, M, i); 7384 unsigned Idx = WhichResult * NumElts / 2; 7385 for (unsigned j = 0; j < NumElts; j += 2) { 7386 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7387 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 7388 return false; 7389 Idx += 1; 7390 } 7391 } 7392 7393 if (M.size() == NumElts*2) 7394 WhichResult = 0; 7395 7396 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7397 if (VT.is64BitVector() && EltSz == 32) 7398 return false; 7399 7400 return true; 7401 } 7402 7403 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 7404 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7405 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 7406 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7407 unsigned EltSz = VT.getScalarSizeInBits(); 7408 if (EltSz == 64) 7409 return false; 7410 7411 unsigned NumElts = VT.getVectorNumElements(); 7412 if (M.size() != NumElts && M.size() != NumElts*2) 7413 return false; 7414 7415 for (unsigned i = 0; i < M.size(); i += NumElts) { 7416 WhichResult = SelectPairHalf(NumElts, M, i); 7417 unsigned Idx = WhichResult * NumElts / 2; 7418 for (unsigned j = 0; j < NumElts; j += 2) { 7419 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7420 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 7421 return false; 7422 Idx += 1; 7423 } 7424 } 7425 7426 if (M.size() == NumElts*2) 7427 WhichResult = 0; 7428 7429 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7430 if (VT.is64BitVector() && EltSz == 32) 7431 return false; 7432 7433 return true; 7434 } 7435 7436 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 7437 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 7438 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 7439 unsigned &WhichResult, 7440 bool &isV_UNDEF) { 7441 isV_UNDEF = false; 7442 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 7443 return ARMISD::VTRN; 7444 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7445 return ARMISD::VUZP; 7446 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7447 return ARMISD::VZIP; 7448 7449 isV_UNDEF = true; 7450 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7451 return ARMISD::VTRN; 7452 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7453 return ARMISD::VUZP; 7454 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7455 return ARMISD::VZIP; 7456 7457 return 0; 7458 } 7459 7460 /// \return true if this is a reverse operation on a vector. 7461 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7462 unsigned NumElts = VT.getVectorNumElements(); 7463 // Make sure the mask has the right size. 7464 if (NumElts != M.size()) 7465 return false; 7466 7467 // Look for <15, ..., 3, -1, 1, 0>.
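// i.e. every defined index must equal NumElts - 1 - i; undef (-1) entries are allowed at any position.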
7468 for (unsigned i = 0; i != NumElts; ++i) 7469 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7470 return false; 7471 7472 return true; 7473 } 7474 7475 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { 7476 unsigned NumElts = VT.getVectorNumElements(); 7477 // Make sure the mask has the right size. 7478 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7479 return false; 7480 7481 // If Top 7482 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7483 // This inserts Input2 into Input1 7484 // else if not Top 7485 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7486 // This inserts Input1 into Input2 7487 unsigned Offset = Top ? 0 : 1; 7488 unsigned N = SingleSource ? 0 : NumElts; 7489 for (unsigned i = 0; i < NumElts; i += 2) { 7490 if (M[i] >= 0 && M[i] != (int)i) 7491 return false; 7492 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) 7493 return false; 7494 } 7495 7496 return true; 7497 } 7498 7499 static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) { 7500 unsigned NumElts = ToVT.getVectorNumElements(); 7501 if (NumElts != M.size()) 7502 return false; 7503 7504 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are 7505 // looking for patterns of: 7506 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ... 7507 // rev: N/2 0 N/2+1 1 N/2+2 2 ... 7508 7509 unsigned Off0 = rev ? NumElts / 2 : 0; 7510 unsigned Off1 = rev ? 0 : NumElts / 2; 7511 for (unsigned i = 0; i < NumElts; i += 2) { 7512 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) 7513 return false; 7514 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) 7515 return false; 7516 } 7517 7518 return true; 7519 } 7520 7521 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted 7522 // from a pair of inputs. For example: 7523 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), 7524 // FP_ROUND(EXTRACT_ELT(Y, 0), 7525 // FP_ROUND(EXTRACT_ELT(X, 1), 7526 // FP_ROUND(EXTRACT_ELT(Y, 1), ...) 7527 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, 7528 const ARMSubtarget *ST) { 7529 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7530 if (!ST->hasMVEFloatOps()) 7531 return SDValue(); 7532 7533 SDLoc dl(BV); 7534 EVT VT = BV.getValueType(); 7535 if (VT != MVT::v8f16) 7536 return SDValue(); 7537 7538 // We are looking for a buildvector of fptrunc elements, where all the 7539 // elements are extracted alternately from two sources. Check the first two 7540 // items are valid enough and extract some info from them (they are checked 7541 // properly in the loop below). 7542 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || 7543 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7544 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) 7545 return SDValue(); 7546 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || 7547 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7548 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) 7549 return SDValue(); 7550 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7551 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); 7552 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) 7553 return SDValue(); 7554 7555 // Check all the values in the BuildVector line up with our expectations.
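// i.e. operand 2*i must be FP_ROUND(EXTRACT_ELT(X, i)) and operand 2*i+1 must be FP_ROUND(EXTRACT_ELT(Y, i)) for i = 1..3; i == 0 was checked above.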
7556 for (unsigned i = 1; i < 4; i++) { 7557 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7558 return Trunc.getOpcode() == ISD::FP_ROUND && 7559 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7560 Trunc.getOperand(0).getOperand(0) == Op && 7561 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7562 }; 7563 if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) 7564 return SDValue(); 7565 if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) 7566 return SDValue(); 7567 } 7568 7569 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, 7570 DAG.getConstant(0, dl, MVT::i32)); 7571 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, 7572 DAG.getConstant(1, dl, MVT::i32)); 7573 } 7574 7575 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted 7576 // from a single input on alternating lanes. For example: 7577 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0), 7578 // FP_EXTEND(EXTRACT_ELT(X, 2), 7579 // FP_EXTEND(EXTRACT_ELT(X, 4), ...) 7580 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, 7581 const ARMSubtarget *ST) { 7582 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7583 if (!ST->hasMVEFloatOps()) 7584 return SDValue(); 7585 7586 SDLoc dl(BV); 7587 EVT VT = BV.getValueType(); 7588 if (VT != MVT::v4f32) 7589 return SDValue(); 7590 7591 // We are looking for a buildvector of fpext elements, where all the 7592 // elements are alternating lanes from a single source. For example <0,2,4,6> 7593 // or <1,3,5,7>. Check the first two items are valid enough and extract some 7594 // info from them (they are checked properly in the loop below). 7595 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || 7596 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) 7597 return SDValue(); 7598 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7599 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); 7600 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) 7601 return SDValue(); 7602 7603 // Check all the values in the BuildVector line up with our expectations. 7604 for (unsigned i = 1; i < 4; i++) { 7605 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7606 return Trunc.getOpcode() == ISD::FP_EXTEND && 7607 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7608 Trunc.getOperand(0).getOperand(0) == Op && 7609 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7610 }; 7611 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) 7612 return SDValue(); 7613 } 7614 7615 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, 7616 DAG.getConstant(Offset, dl, MVT::i32)); 7617 } 7618 7619 // If N is an integer constant that can be moved into a register in one 7620 // instruction, return an SDValue of such a constant (will become a MOV 7621 // instruction). Otherwise return null.
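// A value qualifies if it, or its bitwise complement, fits the target's move-immediate encoding: 0-255 for Thumb1, otherwise a modified immediate accepted by ARM_AM::getSOImmVal.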
7622 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7623 const ARMSubtarget *ST, const SDLoc &dl) { 7624 uint64_t Val; 7625 if (!isa<ConstantSDNode>(N)) 7626 return SDValue(); 7627 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7628 7629 if (ST->isThumb1Only()) { 7630 if (Val <= 255 || ~Val <= 255) 7631 return DAG.getConstant(Val, dl, MVT::i32); 7632 } else { 7633 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7634 return DAG.getConstant(Val, dl, MVT::i32); 7635 } 7636 return SDValue(); 7637 } 7638 7639 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7640 const ARMSubtarget *ST) { 7641 SDLoc dl(Op); 7642 EVT VT = Op.getValueType(); 7643 7644 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7645 7646 unsigned NumElts = VT.getVectorNumElements(); 7647 unsigned BoolMask; 7648 unsigned BitsPerBool; 7649 if (NumElts == 2) { 7650 BitsPerBool = 8; 7651 BoolMask = 0xff; 7652 } else if (NumElts == 4) { 7653 BitsPerBool = 4; 7654 BoolMask = 0xf; 7655 } else if (NumElts == 8) { 7656 BitsPerBool = 2; 7657 BoolMask = 0x3; 7658 } else if (NumElts == 16) { 7659 BitsPerBool = 1; 7660 BoolMask = 0x1; 7661 } else 7662 return SDValue(); 7663 7664 // If this is a single value copied into all lanes (a splat), we can just sign 7665 // extend that single value 7666 SDValue FirstOp = Op.getOperand(0); 7667 if (!isa<ConstantSDNode>(FirstOp) && 7668 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7669 [&FirstOp](SDUse &U) { 7670 return U.get().isUndef() || U.get() == FirstOp; 7671 })) { 7672 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7673 DAG.getValueType(MVT::i1)); 7674 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7675 } 7676 7677 // First create base with bits set where known 7678 unsigned Bits32 = 0; 7679 for (unsigned i = 0; i < NumElts; ++i) { 7680 SDValue V = Op.getOperand(i); 7681 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7682 continue; 7683 bool BitSet = V.isUndef() ? 
false : cast<ConstantSDNode>(V)->getZExtValue(); 7684 if (BitSet) 7685 Bits32 |= BoolMask << (i * BitsPerBool); 7686 } 7687 7688 // Add in unknown nodes 7689 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7690 DAG.getConstant(Bits32, dl, MVT::i32)); 7691 for (unsigned i = 0; i < NumElts; ++i) { 7692 SDValue V = Op.getOperand(i); 7693 if (isa<ConstantSDNode>(V) || V.isUndef()) 7694 continue; 7695 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7696 DAG.getConstant(i, dl, MVT::i32)); 7697 } 7698 7699 return Base; 7700 } 7701 7702 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, 7703 const ARMSubtarget *ST) { 7704 if (!ST->hasMVEIntegerOps()) 7705 return SDValue(); 7706 7707 // We are looking for a buildvector where each element is Op[0] + i*N 7708 EVT VT = Op.getValueType(); 7709 SDValue Op0 = Op.getOperand(0); 7710 unsigned NumElts = VT.getVectorNumElements(); 7711 7712 // Get the increment value from operand 1 7713 SDValue Op1 = Op.getOperand(1); 7714 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || 7715 !isa<ConstantSDNode>(Op1.getOperand(1))) 7716 return SDValue(); 7717 unsigned N = Op1.getConstantOperandVal(1); 7718 if (N != 1 && N != 2 && N != 4 && N != 8) 7719 return SDValue(); 7720 7721 // Check that each other operand matches 7722 for (unsigned I = 2; I < NumElts; I++) { 7723 SDValue OpI = Op.getOperand(I); 7724 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || 7725 !isa<ConstantSDNode>(OpI.getOperand(1)) || 7726 OpI.getConstantOperandVal(1) != I * N) 7727 return SDValue(); 7728 } 7729 7730 SDLoc DL(Op); 7731 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, 7732 DAG.getConstant(N, DL, MVT::i32)); 7733 } 7734 7735 // Returns true if the operation N can be treated as qr instruction variant at 7736 // operand Op. 7737 static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { 7738 switch (N->getOpcode()) { 7739 case ISD::ADD: 7740 case ISD::MUL: 7741 case ISD::SADDSAT: 7742 case ISD::UADDSAT: 7743 return true; 7744 case ISD::SUB: 7745 case ISD::SSUBSAT: 7746 case ISD::USUBSAT: 7747 return N->getOperand(1).getNode() == Op; 7748 case ISD::INTRINSIC_WO_CHAIN: 7749 switch (N->getConstantOperandVal(0)) { 7750 case Intrinsic::arm_mve_add_predicated: 7751 case Intrinsic::arm_mve_mul_predicated: 7752 case Intrinsic::arm_mve_qadd_predicated: 7753 case Intrinsic::arm_mve_vhadd: 7754 case Intrinsic::arm_mve_hadd_predicated: 7755 case Intrinsic::arm_mve_vqdmulh: 7756 case Intrinsic::arm_mve_qdmulh_predicated: 7757 case Intrinsic::arm_mve_vqrdmulh: 7758 case Intrinsic::arm_mve_qrdmulh_predicated: 7759 case Intrinsic::arm_mve_vqdmull: 7760 case Intrinsic::arm_mve_vqdmull_predicated: 7761 return true; 7762 case Intrinsic::arm_mve_sub_predicated: 7763 case Intrinsic::arm_mve_qsub_predicated: 7764 case Intrinsic::arm_mve_vhsub: 7765 case Intrinsic::arm_mve_hsub_predicated: 7766 return N->getOperand(2).getNode() == Op; 7767 default: 7768 return false; 7769 } 7770 default: 7771 return false; 7772 } 7773 } 7774 7775 // If this is a case we can't handle, return null and let the default 7776 // expansion code take care of it. 
7777 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7778 const ARMSubtarget *ST) const { 7779 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7780 SDLoc dl(Op); 7781 EVT VT = Op.getValueType(); 7782 7783 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7784 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7785 7786 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) 7787 return R; 7788 7789 APInt SplatBits, SplatUndef; 7790 unsigned SplatBitSize; 7791 bool HasAnyUndefs; 7792 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7793 if (SplatUndef.isAllOnes()) 7794 return DAG.getUNDEF(VT); 7795 7796 // If all the users of this constant splat are qr instruction variants, 7797 // generate a vdup of the constant. 7798 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && 7799 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && 7800 all_of(BVN->uses(), 7801 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { 7802 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 7803 : SplatBitSize == 16 ? MVT::v8i16 7804 : MVT::v16i8; 7805 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); 7806 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); 7807 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); 7808 } 7809 7810 if ((ST->hasNEON() && SplatBitSize <= 64) || 7811 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { 7812 // Check if an immediate VMOV works. 7813 EVT VmovVT; 7814 SDValue Val = 7815 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 7816 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); 7817 7818 if (Val.getNode()) { 7819 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7820 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7821 } 7822 7823 // Try an immediate VMVN. 7824 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7825 Val = isVMOVModifiedImm( 7826 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, 7827 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7828 if (Val.getNode()) { 7829 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7830 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7831 } 7832 7833 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7834 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7835 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7836 if (ImmVal != -1) { 7837 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7838 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7839 } 7840 } 7841 7842 // If we are under MVE, generate a VDUP(constant), bitcast to the original 7843 // type. 7844 if (ST->hasMVEIntegerOps() && 7845 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { 7846 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 7847 : SplatBitSize == 16 ? MVT::v8i16 7848 : MVT::v16i8; 7849 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); 7850 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); 7851 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); 7852 } 7853 } 7854 } 7855 7856 // Scan through the operands to see if only one value is used. 7857 // 7858 // As an optimisation, even if more than one value is used it may be more 7859 // profitable to splat with one value then change some lanes. 7860 // 7861 // Heuristically we decide to do this if the vector has a "dominant" value, 7862 // defined as splatted to more than half of the lanes. 
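// e.g. <x, x, x, y> has a dominant value x (three of four lanes): splat x with a VDUP, then insert y into lane 3.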
7863 unsigned NumElts = VT.getVectorNumElements(); 7864 bool isOnlyLowElement = true; 7865 bool usesOnlyOneValue = true; 7866 bool hasDominantValue = false; 7867 bool isConstant = true; 7868 7869 // Map of the number of times a particular SDValue appears in the 7870 // element list. 7871 DenseMap<SDValue, unsigned> ValueCounts; 7872 SDValue Value; 7873 for (unsigned i = 0; i < NumElts; ++i) { 7874 SDValue V = Op.getOperand(i); 7875 if (V.isUndef()) 7876 continue; 7877 if (i > 0) 7878 isOnlyLowElement = false; 7879 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7880 isConstant = false; 7881 7882 ValueCounts.insert(std::make_pair(V, 0)); 7883 unsigned &Count = ValueCounts[V]; 7884 7885 // Is this value dominant? (takes up more than half of the lanes) 7886 if (++Count > (NumElts / 2)) { 7887 hasDominantValue = true; 7888 Value = V; 7889 } 7890 } 7891 if (ValueCounts.size() != 1) 7892 usesOnlyOneValue = false; 7893 if (!Value.getNode() && !ValueCounts.empty()) 7894 Value = ValueCounts.begin()->first; 7895 7896 if (ValueCounts.empty()) 7897 return DAG.getUNDEF(VT); 7898 7899 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7900 // Keep going if we are hitting this case. 7901 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7902 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7903 7904 unsigned EltSize = VT.getScalarSizeInBits(); 7905 7906 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7907 // i32 and try again. 7908 if (hasDominantValue && EltSize <= 32) { 7909 if (!isConstant) { 7910 SDValue N; 7911 7912 // If we are VDUPing a value that comes directly from a vector, that will 7913 // cause an unnecessary move to and from a GPR, where instead we could 7914 // just use VDUPLANE. We can only do this if the lane being extracted 7915 // is at a constant index, as the VDUP from lane instructions only have 7916 // constant-index forms. 7917 ConstantSDNode *constIndex; 7918 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7919 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7920 // We need to create a new undef vector to use for the VDUPLANE if the 7921 // size of the vector from which we get the value is different than the 7922 // size of the vector that we need to create. We will insert the element 7923 // such that the register coalescer will remove unnecessary copies. 7924 if (VT != Value->getOperand(0).getValueType()) { 7925 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7926 VT.getVectorNumElements(); 7927 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7928 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7929 Value, DAG.getConstant(index, dl, MVT::i32)), 7930 DAG.getConstant(index, dl, MVT::i32)); 7931 } else 7932 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7933 Value->getOperand(0), Value->getOperand(1)); 7934 } else 7935 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7936 7937 if (!usesOnlyOneValue) { 7938 // The dominant value was splatted as 'N', but we now have to insert 7939 // all differing elements. 
7940 for (unsigned I = 0; I < NumElts; ++I) { 7941 if (Op.getOperand(I) == Value) 7942 continue; 7943 SmallVector<SDValue, 3> Ops; 7944 Ops.push_back(N); 7945 Ops.push_back(Op.getOperand(I)); 7946 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7947 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7948 } 7949 } 7950 return N; 7951 } 7952 if (VT.getVectorElementType().isFloatingPoint()) { 7953 SmallVector<SDValue, 8> Ops; 7954 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7955 assert(FVT == MVT::f32 || FVT == MVT::f16); 7956 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7957 for (unsigned i = 0; i < NumElts; ++i) 7958 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7959 Op.getOperand(i))); 7960 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7961 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7962 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7963 if (Val.getNode()) 7964 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7965 } 7966 if (usesOnlyOneValue) { 7967 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7968 if (isConstant && Val.getNode()) 7969 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7970 } 7971 } 7972 7973 // If all elements are constants and the case above didn't get hit, fall back 7974 // to the default expansion, which will generate a load from the constant 7975 // pool. 7976 if (isConstant) 7977 return SDValue(); 7978 7979 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and 7980 // vmovn). Empirical tests suggest this is rarely worth it for vectors of 7981 // length <= 2. 7982 if (NumElts >= 4) 7983 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 7984 return shuffle; 7985 7986 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into 7987 // VCVT's 7988 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) 7989 return VCVT; 7990 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) 7991 return VCVT; 7992 7993 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7994 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7995 // into two 64-bit vectors; we might discover a better way to lower it. 7996 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7997 EVT ExtVT = VT.getVectorElementType(); 7998 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7999 SDValue Lower = 8000 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 8001 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 8002 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 8003 SDValue Upper = DAG.getBuildVector( 8004 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 8005 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 8006 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 8007 if (Lower && Upper) 8008 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 8009 } 8010 8011 // Vectors with 32- or 64-bit elements can be built by directly assigning 8012 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 8013 // will be legalized. 8014 if (EltSize >= 32) { 8015 // Do the expansion with floating-point types, since that is what the VFP 8016 // registers are defined to use, and since i64 is not legal. 
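// e.g. a v2i64 BUILD_VECTOR becomes an ARMISD::BUILD_VECTOR of two f64 operands, with a bitcast back to v2i64 at the end.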
8017 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8018 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8019 SmallVector<SDValue, 8> Ops; 8020 for (unsigned i = 0; i < NumElts; ++i) 8021 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 8022 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8023 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8024 } 8025 8026 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 8027 // know the default expansion would otherwise fall back on something even 8028 // worse. For a vector with one or two non-undef values, that's 8029 // scalar_to_vector for the elements followed by a shuffle (provided the 8030 // shuffle is valid for the target) and materialization element by element 8031 // on the stack followed by a load for everything else. 8032 if (!isConstant && !usesOnlyOneValue) { 8033 SDValue Vec = DAG.getUNDEF(VT); 8034 for (unsigned i = 0 ; i < NumElts; ++i) { 8035 SDValue V = Op.getOperand(i); 8036 if (V.isUndef()) 8037 continue; 8038 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 8039 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 8040 } 8041 return Vec; 8042 } 8043 8044 return SDValue(); 8045 } 8046 8047 // Gather data to see if the operation can be modelled as a 8048 // shuffle in combination with VEXTs. 8049 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 8050 SelectionDAG &DAG) const { 8051 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 8052 SDLoc dl(Op); 8053 EVT VT = Op.getValueType(); 8054 unsigned NumElts = VT.getVectorNumElements(); 8055 8056 struct ShuffleSourceInfo { 8057 SDValue Vec; 8058 unsigned MinElt = std::numeric_limits<unsigned>::max(); 8059 unsigned MaxElt = 0; 8060 8061 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 8062 // be compatible with the shuffle we intend to construct. As a result 8063 // ShuffleVec will be some sliding window into the original Vec. 8064 SDValue ShuffleVec; 8065 8066 // Code should guarantee that element i in Vec starts at element "WindowBase 8067 // + i * WindowScale in ShuffleVec". 8068 int WindowBase = 0; 8069 int WindowScale = 1; 8070 8071 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 8072 8073 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 8074 }; 8075 8076 // First gather all vectors used as an immediate source for this BUILD_VECTOR 8077 // node. 8078 SmallVector<ShuffleSourceInfo, 2> Sources; 8079 for (unsigned i = 0; i < NumElts; ++i) { 8080 SDValue V = Op.getOperand(i); 8081 if (V.isUndef()) 8082 continue; 8083 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 8084 // A shuffle can only come from building a vector from various 8085 // elements of other vectors. 8086 return SDValue(); 8087 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 8088 // Furthermore, shuffles require a constant mask, whereas extractelts 8089 // accept variable indices. 8090 return SDValue(); 8091 } 8092 8093 // Add this element source to the list if it's not already there. 8094 SDValue SourceVec = V.getOperand(0); 8095 auto Source = llvm::find(Sources, SourceVec); 8096 if (Source == Sources.end()) 8097 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 8098 8099 // Update the minimum and maximum lane number seen. 
8100 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 8101 Source->MinElt = std::min(Source->MinElt, EltNo); 8102 Source->MaxElt = std::max(Source->MaxElt, EltNo); 8103 } 8104 8105 // Currently only do something sane when at most two source vectors 8106 // are involved. 8107 if (Sources.size() > 2) 8108 return SDValue(); 8109 8110 // Find out the smallest element size among result and two sources, and use 8111 // it as element size to build the shuffle_vector. 8112 EVT SmallestEltTy = VT.getVectorElementType(); 8113 for (auto &Source : Sources) { 8114 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 8115 if (SrcEltTy.bitsLT(SmallestEltTy)) 8116 SmallestEltTy = SrcEltTy; 8117 } 8118 unsigned ResMultiplier = 8119 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 8120 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 8121 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 8122 8123 // If the source vector is too wide or too narrow, we may nevertheless be able 8124 // to construct a compatible shuffle either by concatenating it with UNDEF or 8125 // extracting a suitable range of elements. 8126 for (auto &Src : Sources) { 8127 EVT SrcVT = Src.ShuffleVec.getValueType(); 8128 8129 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); 8130 uint64_t VTSize = VT.getFixedSizeInBits(); 8131 if (SrcVTSize == VTSize) 8132 continue; 8133 8134 // This stage of the search produces a source with the same element type as 8135 // the original, but with a total width matching the BUILD_VECTOR output. 8136 EVT EltVT = SrcVT.getVectorElementType(); 8137 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 8138 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 8139 8140 if (SrcVTSize < VTSize) { 8141 if (2 * SrcVTSize != VTSize) 8142 return SDValue(); 8143 // We can pad out the smaller vector for free, so if it's part of a 8144 // shuffle... 8145 Src.ShuffleVec = 8146 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 8147 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 8148 continue; 8149 } 8150 8151 if (SrcVTSize != 2 * VTSize) 8152 return SDValue(); 8153 8154 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 8155 // Span too large for a VEXT to cope 8156 return SDValue(); 8157 } 8158 8159 if (Src.MinElt >= NumSrcElts) { 8160 // The extraction can just take the second half 8161 Src.ShuffleVec = 8162 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8163 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8164 Src.WindowBase = -NumSrcElts; 8165 } else if (Src.MaxElt < NumSrcElts) { 8166 // The extraction can just take the first half 8167 Src.ShuffleVec = 8168 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8169 DAG.getConstant(0, dl, MVT::i32)); 8170 } else { 8171 // An actual VEXT is needed 8172 SDValue VEXTSrc1 = 8173 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8174 DAG.getConstant(0, dl, MVT::i32)); 8175 SDValue VEXTSrc2 = 8176 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8177 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8178 8179 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 8180 VEXTSrc2, 8181 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 8182 Src.WindowBase = -Src.MinElt; 8183 } 8184 } 8185 8186 // Another possible incompatibility occurs from the vector element types. We 8187 // can fix this by bitcasting the source vectors to the same type we intend 8188 // for the shuffle. 
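// e.g. a v4i32 source feeding a v8i16 shuffle is VECTOR_REG_CAST to v8i16; each original lane then covers WindowScale (here 2) lanes of the shuffle.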
8189 for (auto &Src : Sources) { 8190 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 8191 if (SrcEltTy == SmallestEltTy) 8192 continue; 8193 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 8194 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); 8195 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 8196 Src.WindowBase *= Src.WindowScale; 8197 } 8198 8199 // Final check before we try to actually produce a shuffle. 8200 LLVM_DEBUG(for (auto Src 8201 : Sources) 8202 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 8203 8204 // The stars all align, our next step is to produce the mask for the shuffle. 8205 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 8206 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 8207 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 8208 SDValue Entry = Op.getOperand(i); 8209 if (Entry.isUndef()) 8210 continue; 8211 8212 auto Src = llvm::find(Sources, Entry.getOperand(0)); 8213 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 8214 8215 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 8216 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 8217 // segment. 8218 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 8219 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 8220 VT.getScalarSizeInBits()); 8221 int LanesDefined = BitsDefined / BitsPerShuffleLane; 8222 8223 // This source is expected to fill ResMultiplier lanes of the final shuffle, 8224 // starting at the appropriate offset. 8225 int *LaneMask = &Mask[i * ResMultiplier]; 8226 8227 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 8228 ExtractBase += NumElts * (Src - Sources.begin()); 8229 for (int j = 0; j < LanesDefined; ++j) 8230 LaneMask[j] = ExtractBase + j; 8231 } 8232 8233 8234 // We can't handle more than two sources. This should have already 8235 // been checked before this point. 8236 assert(Sources.size() <= 2 && "Too many sources!"); 8237 8238 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 8239 for (unsigned i = 0; i < Sources.size(); ++i) 8240 ShuffleOps[i] = Sources[i].ShuffleVec; 8241 8242 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 8243 ShuffleOps[1], Mask, DAG); 8244 if (!Shuffle) 8245 return SDValue(); 8246 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); 8247 } 8248 8249 enum ShuffleOpCodes { 8250 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 8251 OP_VREV, 8252 OP_VDUP0, 8253 OP_VDUP1, 8254 OP_VDUP2, 8255 OP_VDUP3, 8256 OP_VEXT1, 8257 OP_VEXT2, 8258 OP_VEXT3, 8259 OP_VUZPL, // VUZP, left result 8260 OP_VUZPR, // VUZP, right result 8261 OP_VZIPL, // VZIP, left result 8262 OP_VZIPR, // VZIP, right result 8263 OP_VTRNL, // VTRN, left result 8264 OP_VTRNR // VTRN, right result 8265 }; 8266 8267 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 8268 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8269 switch (OpNum) { 8270 case OP_COPY: 8271 case OP_VREV: 8272 case OP_VDUP0: 8273 case OP_VDUP1: 8274 case OP_VDUP2: 8275 case OP_VDUP3: 8276 return true; 8277 } 8278 return false; 8279 } 8280 8281 /// isShuffleMaskLegal - Targets can use this to indicate that they only 8282 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
8283 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8284 /// are assumed to be legal. 8285 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 8286 if (VT.getVectorNumElements() == 4 && 8287 (VT.is128BitVector() || VT.is64BitVector())) { 8288 unsigned PFIndexes[4]; 8289 for (unsigned i = 0; i != 4; ++i) { 8290 if (M[i] < 0) 8291 PFIndexes[i] = 8; 8292 else 8293 PFIndexes[i] = M[i]; 8294 } 8295 8296 // Compute the index in the perfect shuffle table. 8297 unsigned PFTableIndex = 8298 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8299 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8300 unsigned Cost = (PFEntry >> 30); 8301 8302 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 8303 return true; 8304 } 8305 8306 bool ReverseVEXT, isV_UNDEF; 8307 unsigned Imm, WhichResult; 8308 8309 unsigned EltSize = VT.getScalarSizeInBits(); 8310 if (EltSize >= 32 || 8311 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8312 ShuffleVectorInst::isIdentityMask(M) || 8313 isVREVMask(M, VT, 64) || 8314 isVREVMask(M, VT, 32) || 8315 isVREVMask(M, VT, 16)) 8316 return true; 8317 else if (Subtarget->hasNEON() && 8318 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 8319 isVTBLMask(M, VT) || 8320 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 8321 return true; 8322 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8323 isReverseMask(M, VT)) 8324 return true; 8325 else if (Subtarget->hasMVEIntegerOps() && 8326 (isVMOVNMask(M, VT, true, false) || 8327 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) 8328 return true; 8329 else 8330 return false; 8331 } 8332 8333 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 8334 /// the specified operations to build the shuffle. 8335 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 8336 SDValue RHS, SelectionDAG &DAG, 8337 const SDLoc &dl) { 8338 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8339 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8340 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8341 8342 if (OpNum == OP_COPY) { 8343 if (LHSID == (1*9+2)*9+3) return LHS; 8344 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 8345 return RHS; 8346 } 8347 8348 SDValue OpLHS, OpRHS; 8349 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 8350 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 8351 EVT VT = OpLHS.getValueType(); 8352 8353 switch (OpNum) { 8354 default: llvm_unreachable("Unknown shuffle opcode!"); 8355 case OP_VREV: 8356 // VREV divides the vector in half and swaps within the half. 
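// e.g. VREV64 on <a,b,c,d> (v4i32) gives <b,a,d,c>.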
8357 if (VT.getVectorElementType() == MVT::i32 || 8358 VT.getVectorElementType() == MVT::f32) 8359 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 8360 // vrev <4 x i16> -> VREV32 8361 if (VT.getVectorElementType() == MVT::i16 || 8362 VT.getVectorElementType() == MVT::f16) 8363 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 8364 // vrev <4 x i8> -> VREV16 8365 assert(VT.getVectorElementType() == MVT::i8); 8366 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 8367 case OP_VDUP0: 8368 case OP_VDUP1: 8369 case OP_VDUP2: 8370 case OP_VDUP3: 8371 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 8372 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 8373 case OP_VEXT1: 8374 case OP_VEXT2: 8375 case OP_VEXT3: 8376 return DAG.getNode(ARMISD::VEXT, dl, VT, 8377 OpLHS, OpRHS, 8378 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 8379 case OP_VUZPL: 8380 case OP_VUZPR: 8381 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 8382 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 8383 case OP_VZIPL: 8384 case OP_VZIPR: 8385 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 8386 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 8387 case OP_VTRNL: 8388 case OP_VTRNR: 8389 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 8390 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 8391 } 8392 } 8393 8394 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 8395 ArrayRef<int> ShuffleMask, 8396 SelectionDAG &DAG) { 8397 // Check to see if we can use the VTBL instruction. 8398 SDValue V1 = Op.getOperand(0); 8399 SDValue V2 = Op.getOperand(1); 8400 SDLoc DL(Op); 8401 8402 SmallVector<SDValue, 8> VTBLMask; 8403 for (int I : ShuffleMask) 8404 VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); 8405 8406 if (V2.getNode()->isUndef()) 8407 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 8408 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8409 8410 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 8411 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8412 } 8413 8414 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 8415 SDLoc DL(Op); 8416 EVT VT = Op.getValueType(); 8417 8418 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8419 "Expect an v8i16/v16i8 type"); 8420 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0)); 8421 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now, 8422 // extract the first 8 bytes into the top double word and the last 8 bytes 8423 // into the bottom double word, through a new vector shuffle that will be 8424 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE. 
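// e.g. for v16i8 the new mask is <8..15, 0..7>, swapping the two double words of the VREV64 result.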
8425 std::vector<int> NewMask; 8426 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) 8427 NewMask.push_back(VT.getVectorNumElements() / 2 + i); 8428 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) 8429 NewMask.push_back(i); 8430 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask); 8431 } 8432 8433 static EVT getVectorTyFromPredicateVector(EVT VT) { 8434 switch (VT.getSimpleVT().SimpleTy) { 8435 case MVT::v2i1: 8436 return MVT::v2f64; 8437 case MVT::v4i1: 8438 return MVT::v4i32; 8439 case MVT::v8i1: 8440 return MVT::v8i16; 8441 case MVT::v16i1: 8442 return MVT::v16i8; 8443 default: 8444 llvm_unreachable("Unexpected vector predicate type"); 8445 } 8446 } 8447 8448 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 8449 SelectionDAG &DAG) { 8450 // Converting from boolean predicates to integers involves creating a vector 8451 // of all ones or all zeroes and selecting the lanes based upon the real 8452 // predicate. 8453 SDValue AllOnes = 8454 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 8455 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 8456 8457 SDValue AllZeroes = 8458 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 8459 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 8460 8461 // Get full vector type from predicate type 8462 EVT NewVT = getVectorTyFromPredicateVector(VT); 8463 8464 SDValue RecastV1; 8465 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 8466 // this to a v16i1. This cannot be done with an ordinary bitcast because the 8467 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 8468 // since we know in hardware the sizes are really the same. 8469 if (VT != MVT::v16i1) 8470 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 8471 else 8472 RecastV1 = Pred; 8473 8474 // Select either all ones or zeroes depending upon the real predicate bits. 8475 SDValue PredAsVector = 8476 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 8477 8478 // Recast our new predicate-as-integer v16i8 vector into something 8479 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 8480 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 8481 } 8482 8483 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 8484 const ARMSubtarget *ST) { 8485 EVT VT = Op.getValueType(); 8486 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8487 ArrayRef<int> ShuffleMask = SVN->getMask(); 8488 8489 assert(ST->hasMVEIntegerOps() && 8490 "No support for vector shuffle of boolean predicates"); 8491 8492 SDValue V1 = Op.getOperand(0); 8493 SDLoc dl(Op); 8494 if (isReverseMask(ShuffleMask, VT)) { 8495 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 8496 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 8497 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 8498 DAG.getConstant(16, dl, MVT::i32)); 8499 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 8500 } 8501 8502 // Until we can come up with optimised cases for every single vector 8503 // shuffle in existence we have chosen the least painful strategy. This is 8504 // to essentially promote the boolean predicate to a 8-bit integer, where 8505 // each predicate represents a byte. Then we fall back on a normal integer 8506 // vector shuffle and convert the result back into a predicate vector. 
In 8507 // many cases the generated code might be even better than scalar code 8508 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 8509 // fields in a register into 8 other arbitrary 2-bit fields! 8510 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 8511 EVT NewVT = PredAsVector.getValueType(); 8512 8513 // Do the shuffle! 8514 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 8515 DAG.getUNDEF(NewVT), ShuffleMask); 8516 8517 // Now return the result of comparing the shuffled vector with zero, 8518 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 8519 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 8520 if (VT == MVT::v2i1) { 8521 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); 8522 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, 8523 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8524 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 8525 } 8526 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 8527 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8528 } 8529 8530 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 8531 ArrayRef<int> ShuffleMask, 8532 SelectionDAG &DAG) { 8533 // Attempt to lower the vector shuffle using as many whole register movs as 8534 // possible. This is useful for types smaller than 32 bits, which would 8535 // often otherwise become a series of GPR movs. 8536 SDLoc dl(Op); 8537 EVT VT = Op.getValueType(); 8538 if (VT.getScalarSizeInBits() >= 32) 8539 return SDValue(); 8540 8541 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8542 "Unexpected vector type"); 8543 int NumElts = VT.getVectorNumElements(); 8544 int QuarterSize = NumElts / 4; 8545 // The four final parts of the vector, as i32's 8546 SDValue Parts[4]; 8547 8548 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not 8549 // <u,u,u,u>), returning the vmov lane index 8550 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 8551 // Detect which mov lane this would be from the first non-undef element.
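// e.g. for v16i8 (QuarterSize 4), a part whose mask is <4,5,6,7> maps to 32-bit lane 1, i.e. MovIdx 1.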
8552 int MovIdx = -1; 8553 for (int i = 0; i < Length; i++) { 8554 if (ShuffleMask[Start + i] >= 0) { 8555 if (ShuffleMask[Start + i] % Length != i) 8556 return -1; 8557 MovIdx = ShuffleMask[Start + i] / Length; 8558 break; 8559 } 8560 } 8561 // If all items are undef, leave this for other combines 8562 if (MovIdx == -1) 8563 return -1; 8564 // Check the remaining values are the correct part of the same mov 8565 for (int i = 1; i < Length; i++) { 8566 if (ShuffleMask[Start + i] >= 0 && 8567 (ShuffleMask[Start + i] / Length != MovIdx || 8568 ShuffleMask[Start + i] % Length != i)) 8569 return -1; 8570 } 8571 return MovIdx; 8572 }; 8573 8574 for (int Part = 0; Part < 4; ++Part) { 8575 // Does this part look like a mov? 8576 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 8577 if (Elt != -1) { 8578 SDValue Input = Op->getOperand(0); 8579 if (Elt >= 4) { 8580 Input = Op->getOperand(1); 8581 Elt -= 4; 8582 } 8583 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); 8584 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, 8585 DAG.getConstant(Elt, dl, MVT::i32)); 8586 } 8587 } 8588 8589 // Nothing interesting found, just return 8590 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 8591 return SDValue(); 8592 8593 // The other parts need to be built with the old shuffle vector, cast to a 8594 // v4i32 and extract_vector_elts 8595 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 8596 SmallVector<int, 16> NewShuffleMask; 8597 for (int Part = 0; Part < 4; ++Part) 8598 for (int i = 0; i < QuarterSize; i++) 8599 NewShuffleMask.push_back( 8600 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]); 8601 SDValue NewShuffle = DAG.getVectorShuffle( 8602 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 8603 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); 8604 8605 for (int Part = 0; Part < 4; ++Part) 8606 if (!Parts[Part]) 8607 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, 8608 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 8609 } 8610 // Build a vector out of the various parts and bitcast it back to the original 8611 // type. 8612 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); 8613 return DAG.getBitcast(VT, NewVec); 8614 } 8615 8616 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, 8617 ArrayRef<int> ShuffleMask, 8618 SelectionDAG &DAG) { 8619 SDValue V1 = Op.getOperand(0); 8620 SDValue V2 = Op.getOperand(1); 8621 EVT VT = Op.getValueType(); 8622 unsigned NumElts = VT.getVectorNumElements(); 8623 8624 // A One-Off Identity mask is one that is mostly an identity mask from a 8625 // single source but contains a single element out-of-place, either from a 8626 // different vector or from another position in the same vector. As opposed to 8627 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert 8628 // pair directly.
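// e.g. the v4i32 mask <0,1,6,3> is identity from V1 except element 2, which comes from lane 2 of V2, so it becomes insert(V1, extract(V2, 2), 2).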
8629 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, 8630 int &OffElement) { 8631 OffElement = -1; 8632 int NonUndef = 0; 8633 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { 8634 if (Mask[i] == -1) 8635 continue; 8636 NonUndef++; 8637 if (Mask[i] != i + BaseOffset) { 8638 if (OffElement == -1) 8639 OffElement = i; 8640 else 8641 return false; 8642 } 8643 } 8644 return NonUndef > 2 && OffElement != -1; 8645 }; 8646 int OffElement; 8647 SDValue VInput; 8648 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) 8649 VInput = V1; 8650 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) 8651 VInput = V2; 8652 else 8653 return SDValue(); 8654 8655 SDLoc dl(Op); 8656 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 8657 ? MVT::i32 8658 : VT.getScalarType(); 8659 SDValue Elt = DAG.getNode( 8660 ISD::EXTRACT_VECTOR_ELT, dl, SVT, 8661 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2, 8662 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); 8663 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, 8664 DAG.getVectorIdxConstant(OffElement % NumElts, dl)); 8665 } 8666 8667 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 8668 const ARMSubtarget *ST) { 8669 SDValue V1 = Op.getOperand(0); 8670 SDValue V2 = Op.getOperand(1); 8671 SDLoc dl(Op); 8672 EVT VT = Op.getValueType(); 8673 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8674 unsigned EltSize = VT.getScalarSizeInBits(); 8675 8676 if (ST->hasMVEIntegerOps() && EltSize == 1) 8677 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 8678 8679 // Convert shuffles that are directly supported on NEON to target-specific 8680 // DAG nodes, instead of keeping them as shuffles and matching them again 8681 // during code selection. This is more efficient and avoids the possibility 8682 // of inconsistencies between legalization and selection. 8683 // FIXME: floating-point vectors should be canonicalized to integer vectors 8684 // of the same size so that they get CSEd properly. 8685 ArrayRef<int> ShuffleMask = SVN->getMask(); 8686 8687 if (EltSize <= 32) { 8688 if (SVN->isSplat()) { 8689 int Lane = SVN->getSplatIndex(); 8690 // If this is an undef splat, generate it via "just" vdup, if possible. 8691 if (Lane == -1) Lane = 0; 8692 8693 // Test if V1 is a SCALAR_TO_VECTOR. 8694 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 8695 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8696 } 8697 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 8698 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 8699 // reaches it).
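// i.e. a non-constant BUILD_VECTOR whose lanes other than lane 0 are all undef; splatting lane 0 is then just a VDUP of operand 0.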
8700 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 8701 !isa<ConstantSDNode>(V1.getOperand(0))) { 8702 bool IsScalarToVector = true; 8703 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 8704 if (!V1.getOperand(i).isUndef()) { 8705 IsScalarToVector = false; 8706 break; 8707 } 8708 if (IsScalarToVector) 8709 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8710 } 8711 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 8712 DAG.getConstant(Lane, dl, MVT::i32)); 8713 } 8714 8715 bool ReverseVEXT = false; 8716 unsigned Imm = 0; 8717 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 8718 if (ReverseVEXT) 8719 std::swap(V1, V2); 8720 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 8721 DAG.getConstant(Imm, dl, MVT::i32)); 8722 } 8723 8724 if (isVREVMask(ShuffleMask, VT, 64)) 8725 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 8726 if (isVREVMask(ShuffleMask, VT, 32)) 8727 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8728 if (isVREVMask(ShuffleMask, VT, 16)) 8729 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8730 8731 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8732 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8733 DAG.getConstant(Imm, dl, MVT::i32)); 8734 } 8735 8736 // Check for Neon shuffles that modify both input vectors in place. 8737 // If both results are used, i.e., if there are two shuffles with the same 8738 // source operands and with masks corresponding to both results of one of 8739 // these operations, DAG memoization will ensure that a single node is 8740 // used for both shuffles. 8741 unsigned WhichResult = 0; 8742 bool isV_UNDEF = false; 8743 if (ST->hasNEON()) { 8744 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8745 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8746 if (isV_UNDEF) 8747 V2 = V1; 8748 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8749 .getValue(WhichResult); 8750 } 8751 } 8752 if (ST->hasMVEIntegerOps()) { 8753 if (isVMOVNMask(ShuffleMask, VT, false, false)) 8754 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8755 DAG.getConstant(0, dl, MVT::i32)); 8756 if (isVMOVNMask(ShuffleMask, VT, true, false)) 8757 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8758 DAG.getConstant(1, dl, MVT::i32)); 8759 if (isVMOVNMask(ShuffleMask, VT, true, true)) 8760 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1, 8761 DAG.getConstant(1, dl, MVT::i32)); 8762 } 8763 8764 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8765 // shuffles that produce a result larger than their operands with: 8766 // shuffle(concat(v1, undef), concat(v2, undef)) 8767 // -> 8768 // shuffle(concat(v1, v2), undef) 8769 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8770 // 8771 // This is useful in the general case, but there are special cases where 8772 // native shuffles produce larger results: the two-result ops. 8773 // 8774 // Look through the concat when lowering them: 8775 // shuffle(concat(v1, v2), undef) 8776 // -> 8777 // concat(VZIP(v1, v2):0, :1) 8778 // 8779 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8780 SDValue SubV1 = V1->getOperand(0); 8781 SDValue SubV2 = V1->getOperand(1); 8782 EVT SubVT = SubV1.getValueType(); 8783 8784 // We expect these to have been canonicalized to -1. 
8785 assert(llvm::all_of(ShuffleMask, [&](int i) { 8786 return i < (int)VT.getVectorNumElements(); 8787 }) && "Unexpected shuffle index into UNDEF operand!"); 8788 8789 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8790 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8791 if (isV_UNDEF) 8792 SubV2 = SubV1; 8793 assert((WhichResult == 0) && 8794 "In-place shuffle of concat can only have one result!"); 8795 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8796 SubV1, SubV2); 8797 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8798 Res.getValue(1)); 8799 } 8800 } 8801 } 8802 8803 if (ST->hasMVEIntegerOps() && EltSize <= 32) 8804 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) 8805 return V; 8806 8807 // If the shuffle is not directly supported and it has 4 elements, use 8808 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8809 unsigned NumElts = VT.getVectorNumElements(); 8810 if (NumElts == 4) { 8811 unsigned PFIndexes[4]; 8812 for (unsigned i = 0; i != 4; ++i) { 8813 if (ShuffleMask[i] < 0) 8814 PFIndexes[i] = 8; 8815 else 8816 PFIndexes[i] = ShuffleMask[i]; 8817 } 8818 8819 // Compute the index in the perfect shuffle table. 8820 unsigned PFTableIndex = 8821 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8822 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8823 unsigned Cost = (PFEntry >> 30); 8824 8825 if (Cost <= 4) { 8826 if (ST->hasNEON()) 8827 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8828 else if (isLegalMVEShuffleOp(PFEntry)) { 8829 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8830 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8831 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8832 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8833 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8834 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8835 } 8836 } 8837 } 8838 8839 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 8840 if (EltSize >= 32) { 8841 // Do the expansion with floating-point types, since that is what the VFP 8842 // registers are defined to use, and since i64 is not legal. 8843 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8844 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8845 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 8846 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 8847 SmallVector<SDValue, 8> Ops; 8848 for (unsigned i = 0; i < NumElts; ++i) { 8849 if (ShuffleMask[i] < 0) 8850 Ops.push_back(DAG.getUNDEF(EltVT)); 8851 else 8852 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 8853 ShuffleMask[i] < (int)NumElts ? 
V1 : V2, 8854 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8855 dl, MVT::i32))); 8856 } 8857 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8858 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8859 } 8860 8861 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8862 isReverseMask(ShuffleMask, VT)) 8863 return LowerReverse_VECTOR_SHUFFLE(Op, DAG); 8864 8865 if (ST->hasNEON() && VT == MVT::v8i8) 8866 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8867 return NewOp; 8868 8869 if (ST->hasMVEIntegerOps()) 8870 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8871 return NewOp; 8872 8873 return SDValue(); 8874 } 8875 8876 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8877 const ARMSubtarget *ST) { 8878 EVT VecVT = Op.getOperand(0).getValueType(); 8879 SDLoc dl(Op); 8880 8881 assert(ST->hasMVEIntegerOps() && 8882 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8883 8884 SDValue Conv = 8885 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8886 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8887 unsigned LaneWidth = 8888 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8889 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8890 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8891 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8892 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8893 DAG.getConstant(~Mask, dl, MVT::i32)); 8894 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8895 } 8896 8897 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8898 SelectionDAG &DAG) const { 8899 // INSERT_VECTOR_ELT is legal only for immediate indexes. 8900 SDValue Lane = Op.getOperand(2); 8901 if (!isa<ConstantSDNode>(Lane)) 8902 return SDValue(); 8903 8904 SDValue Elt = Op.getOperand(1); 8905 EVT EltVT = Elt.getValueType(); 8906 8907 if (Subtarget->hasMVEIntegerOps() && 8908 Op.getValueType().getScalarSizeInBits() == 1) 8909 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8910 8911 if (getTypeAction(*DAG.getContext(), EltVT) == 8912 TargetLowering::TypePromoteFloat) { 8913 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8914 // but the type system will try to do that if we don't intervene. 8915 // Reinterpret any such vector-element insertion as one with the 8916 // corresponding integer types. 
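    // For example, (insert_vector_elt v8f16:%vec, f16:%elt, %lane) is handled
    // here by bitcasting %vec to v8i16 and %elt to i16, doing an i16 insert,
    // and bitcasting the result back to v8f16.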

    SDLoc dl(Op);

    EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
    assert(getTypeAction(*DAG.getContext(), IEltVT) !=
           TargetLowering::TypePromoteFloat);

    SDValue VecIn = Op.getOperand(0);
    EVT VecVT = VecIn.getValueType();
    EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
                                  VecVT.getVectorNumElements());

    SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
    SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
    SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
                                  IVecIn, IElt, Lane);
    return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  }

  return Op;
}

static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");

  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
                              DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  return Shift;
}

static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDLoc dl(Op);
  assert(Op.getValueType().getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(isPowerOf2_32(Op.getNumOperands()) &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  auto ConcatPair = [&](SDValue V1, SDValue V2) {
    EVT Op1VT = V1.getValueType();
    EVT Op2VT = V2.getValueType();
    assert(Op1VT == Op2VT && "Operand types don't match!");
    EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
    SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

    // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
    // promoted to v8i16, etc.
    MVT ElType =
        getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
    unsigned NumElts = 2 * Op1VT.getVectorNumElements();

    // Extract the vector elements from Op1 and Op2 one by one and truncate them
    // to be the right size for the destination. For example, if Op1 is v4i1
    // then the promoted vector is v4i32. The result of concatenation gives a
    // v8i1, which when promoted is v8i16.
That means each i32 element from Op1 9007 // needs truncating to i16 and inserting in the result. 9008 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 9009 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 9010 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 9011 EVT NewVT = NewV.getValueType(); 9012 EVT ConcatVT = ConVec.getValueType(); 9013 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 9014 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 9015 DAG.getIntPtrConstant(i, dl)); 9016 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 9017 DAG.getConstant(j, dl, MVT::i32)); 9018 } 9019 return ConVec; 9020 }; 9021 unsigned j = 0; 9022 ConVec = ExtractInto(NewV1, ConVec, j); 9023 ConVec = ExtractInto(NewV2, ConVec, j); 9024 9025 // Now return the result of comparing the subvector with zero, which will 9026 // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we 9027 // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 9028 if (VT == MVT::v2i1) { 9029 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); 9030 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, 9031 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9032 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 9033 } 9034 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 9035 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9036 }; 9037 9038 // Concat each pair of subvectors and pack into the lower half of the array. 9039 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); 9040 while (ConcatOps.size() > 1) { 9041 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { 9042 SDValue V1 = ConcatOps[I]; 9043 SDValue V2 = ConcatOps[I + 1]; 9044 ConcatOps[I / 2] = ConcatPair(V1, V2); 9045 } 9046 ConcatOps.resize(ConcatOps.size() / 2); 9047 } 9048 return ConcatOps[0]; 9049 } 9050 9051 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 9052 const ARMSubtarget *ST) { 9053 EVT VT = Op->getValueType(0); 9054 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 9055 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 9056 9057 // The only time a CONCAT_VECTORS operation can have legal types is when 9058 // two 64-bit vectors are concatenated to a 128-bit vector. 
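  // For example, (v4i32 concat_vectors v2i32:%a, v2i32:%b) is built below by
  // bitcasting %a and %b to f64, inserting them into a v2f64, and bitcasting
  // the result back to v4i32.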
9059 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 9060 "unexpected CONCAT_VECTORS"); 9061 SDLoc dl(Op); 9062 SDValue Val = DAG.getUNDEF(MVT::v2f64); 9063 SDValue Op0 = Op.getOperand(0); 9064 SDValue Op1 = Op.getOperand(1); 9065 if (!Op0.isUndef()) 9066 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 9067 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 9068 DAG.getIntPtrConstant(0, dl)); 9069 if (!Op1.isUndef()) 9070 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 9071 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 9072 DAG.getIntPtrConstant(1, dl)); 9073 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 9074 } 9075 9076 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 9077 const ARMSubtarget *ST) { 9078 SDValue V1 = Op.getOperand(0); 9079 SDValue V2 = Op.getOperand(1); 9080 SDLoc dl(Op); 9081 EVT VT = Op.getValueType(); 9082 EVT Op1VT = V1.getValueType(); 9083 unsigned NumElts = VT.getVectorNumElements(); 9084 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 9085 9086 assert(VT.getScalarSizeInBits() == 1 && 9087 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 9088 assert(ST->hasMVEIntegerOps() && 9089 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 9090 9091 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 9092 9093 // We now have Op1 promoted to a vector of integers, where v8i1 gets 9094 // promoted to v8i16, etc. 9095 9096 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 9097 9098 if (NumElts == 2) { 9099 EVT SubVT = MVT::v4i32; 9100 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 9101 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { 9102 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 9103 DAG.getIntPtrConstant(i, dl)); 9104 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9105 DAG.getConstant(j, dl, MVT::i32)); 9106 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9107 DAG.getConstant(j + 1, dl, MVT::i32)); 9108 } 9109 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, 9110 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9111 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 9112 } 9113 9114 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 9115 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 9116 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 9117 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 9118 DAG.getIntPtrConstant(i, dl)); 9119 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9120 DAG.getConstant(j, dl, MVT::i32)); 9121 } 9122 9123 // Now return the result of comparing the subvector with zero, 9124 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 9125 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 9126 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9127 } 9128 9129 // Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). 
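// For example, (v4i1 truncate v4i32:%x) becomes
// (v4i1 setcc (and %x, splat(1)), splat(0), setne), keeping only bit 0 of
// each lane.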
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  EVT VT = N->getValueType(0);
  assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
         "Expected a vector i1 type!");
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  SDLoc DL(N);

  SDValue And =
      DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
                     DAG.getCondCode(ISD::SETNE));
}

static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  if (ToVT.getScalarType() == MVT::i1)
    return LowerTruncatei1(N, DAG, Subtarget);

  // MVE does not have a single instruction to perform the truncation of a v4i32
  // into the lower half of a v8i16, in the same way that a NEON vmovn would.
  // Most of the instructions in MVE follow the 'Beats' system, where moving
  // values from different lanes is usually something that the instructions
  // avoid.
  //
  // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
  // bottom 16 bits from each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
  // to move order.
  //
  // But truncates and sext/zext are always going to be fairly common from llvm.
  // We have several options for how to deal with them:
  // - Wherever possible combine them into an instruction that makes them
  //   "free". This includes loads/stores, which can perform the trunc as part
  //   of the memory operation. Or certain shuffles that can be turned into
  //   VMOVN/VMOVL.
  // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  //   trunc(mul(sext(a), sext(b))) may become
  //   VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  //   this case can use VMULL). This is performed in the
  //   MVELaneInterleavingPass.
  // - Otherwise we have an option. By default we would expand the
  //   zext/sext/trunc into a series of lane extract/inserts going via GPR
  //   registers. One for each vector lane in the vector. This can obviously be
  //   very expensive.
  // - The other option is to use the fact that loads/stores can extend/truncate
  //   to turn a trunc into two truncating stack stores and a stack reload. This
  //   becomes 3 back-to-back memory operations, but at least that is less than
  //   all the insert/extracts.
  //
  // In order to do the last, we convert certain trunc's into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
  // too early, where other instructions would be better, and stops us from
  // having to reconstruct multiple buildvector shuffles into loads/stores.
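  // For example, (v8i16 truncate v8i32:%x) is not split into lane moves here;
  // it becomes (ARMISD::MVETRUNC lo(%x), hi(%x)) on the two v4i32 halves and
  // is only expanded later if nothing better is found for it.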
9194 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8) 9195 return SDValue(); 9196 EVT FromVT = N->getOperand(0).getValueType(); 9197 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16) 9198 return SDValue(); 9199 9200 SDValue Lo, Hi; 9201 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 9202 SDLoc DL(N); 9203 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); 9204 } 9205 9206 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, 9207 const ARMSubtarget *Subtarget) { 9208 if (!Subtarget->hasMVEIntegerOps()) 9209 return SDValue(); 9210 9211 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. 9212 9213 EVT ToVT = N->getValueType(0); 9214 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) 9215 return SDValue(); 9216 SDValue Op = N->getOperand(0); 9217 EVT FromVT = Op.getValueType(); 9218 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) 9219 return SDValue(); 9220 9221 SDLoc DL(N); 9222 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 9223 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) 9224 ExtVT = MVT::v8i16; 9225 9226 unsigned Opcode = 9227 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT; 9228 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op); 9229 SDValue Ext1 = Ext.getValue(1); 9230 9231 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) { 9232 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext); 9233 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1); 9234 } 9235 9236 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1); 9237 } 9238 9239 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 9240 /// element has been zero/sign-extended, depending on the isSigned parameter, 9241 /// from an integer type half its size. 9242 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 9243 bool isSigned) { 9244 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 9245 EVT VT = N->getValueType(0); 9246 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 9247 SDNode *BVN = N->getOperand(0).getNode(); 9248 if (BVN->getValueType(0) != MVT::v4i32 || 9249 BVN->getOpcode() != ISD::BUILD_VECTOR) 9250 return false; 9251 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 9252 unsigned HiElt = 1 - LoElt; 9253 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 9254 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 9255 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 9256 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 9257 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 9258 return false; 9259 if (isSigned) { 9260 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 9261 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 9262 return true; 9263 } else { 9264 if (Hi0->isZero() && Hi1->isZero()) 9265 return true; 9266 } 9267 return false; 9268 } 9269 9270 if (N->getOpcode() != ISD::BUILD_VECTOR) 9271 return false; 9272 9273 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 9274 SDNode *Elt = N->getOperand(i).getNode(); 9275 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 9276 unsigned EltSize = VT.getScalarSizeInBits(); 9277 unsigned HalfSize = EltSize / 2; 9278 if (isSigned) { 9279 if (!isIntN(HalfSize, C->getSExtValue())) 9280 return false; 9281 } else { 9282 if (!isUIntN(HalfSize, C->getZExtValue())) 9283 return false; 9284 } 9285 continue; 9286 } 9287 return false; 9288 } 9289 9290 return true; 9291 } 9292 9293 /// isSignExtended - Check if a node is a vector value that is sign-extended 9294 /// or a constant BUILD_VECTOR with sign-extended elements. 9295 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 9296 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 9297 return true; 9298 if (isExtendedBUILD_VECTOR(N, DAG, true)) 9299 return true; 9300 return false; 9301 } 9302 9303 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or 9304 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements. 9305 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 9306 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || 9307 ISD::isZEXTLoad(N)) 9308 return true; 9309 if (isExtendedBUILD_VECTOR(N, DAG, false)) 9310 return true; 9311 return false; 9312 } 9313 9314 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 9315 if (OrigVT.getSizeInBits() >= 64) 9316 return OrigVT; 9317 9318 assert(OrigVT.isSimple() && "Expecting a simple value type"); 9319 9320 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 9321 switch (OrigSimpleTy) { 9322 default: llvm_unreachable("Unexpected Vector Type"); 9323 case MVT::v2i8: 9324 case MVT::v2i16: 9325 return MVT::v2i32; 9326 case MVT::v4i8: 9327 return MVT::v4i16; 9328 } 9329 } 9330 9331 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 9332 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 9333 /// We insert the required extension here to get the vector to fill a D register. 9334 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 9335 const EVT &OrigTy, 9336 const EVT &ExtTy, 9337 unsigned ExtOpcode) { 9338 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 9339 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 9340 // 64-bits we need to insert a new extension so that it will be 64-bits. 9341 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 9342 if (OrigTy.getSizeInBits() >= 64) 9343 return N; 9344 9345 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
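  // For example, when the original operand was a v4i8 that had been extended
  // to v4i32, a new extension to v4i16 is inserted here so that the operand
  // exactly fills a D register.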
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}

/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
/// the unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND ||
      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
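  // For example, a constant (v4i32 build_vector 1, 2, 3, 4) whose elements all
  // fit in 16 bits is rebuilt as a v4i16 build_vector of the same values,
  // still using i32 constant operands since smaller integer types are not
  // legal.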
9414 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 9415 EVT VT = N->getValueType(0); 9416 unsigned EltSize = VT.getScalarSizeInBits() / 2; 9417 unsigned NumElts = VT.getVectorNumElements(); 9418 MVT TruncVT = MVT::getIntegerVT(EltSize); 9419 SmallVector<SDValue, 8> Ops; 9420 SDLoc dl(N); 9421 for (unsigned i = 0; i != NumElts; ++i) { 9422 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 9423 const APInt &CInt = C->getAPIntValue(); 9424 // Element types smaller than 32 bits are not legal, so use i32 elements. 9425 // The values are implicitly truncated so sext vs. zext doesn't matter. 9426 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 9427 } 9428 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 9429 } 9430 9431 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 9432 unsigned Opcode = N->getOpcode(); 9433 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9434 SDNode *N0 = N->getOperand(0).getNode(); 9435 SDNode *N1 = N->getOperand(1).getNode(); 9436 return N0->hasOneUse() && N1->hasOneUse() && 9437 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 9438 } 9439 return false; 9440 } 9441 9442 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 9443 unsigned Opcode = N->getOpcode(); 9444 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9445 SDNode *N0 = N->getOperand(0).getNode(); 9446 SDNode *N1 = N->getOperand(1).getNode(); 9447 return N0->hasOneUse() && N1->hasOneUse() && 9448 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 9449 } 9450 return false; 9451 } 9452 9453 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 9454 // Multiplications are only custom-lowered for 128-bit vectors so that 9455 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 9456 EVT VT = Op.getValueType(); 9457 assert(VT.is128BitVector() && VT.isInteger() && 9458 "unexpected type for custom-lowering ISD::MUL"); 9459 SDNode *N0 = Op.getOperand(0).getNode(); 9460 SDNode *N1 = Op.getOperand(1).getNode(); 9461 unsigned NewOpc = 0; 9462 bool isMLA = false; 9463 bool isN0SExt = isSignExtended(N0, DAG); 9464 bool isN1SExt = isSignExtended(N1, DAG); 9465 if (isN0SExt && isN1SExt) 9466 NewOpc = ARMISD::VMULLs; 9467 else { 9468 bool isN0ZExt = isZeroExtended(N0, DAG); 9469 bool isN1ZExt = isZeroExtended(N1, DAG); 9470 if (isN0ZExt && isN1ZExt) 9471 NewOpc = ARMISD::VMULLu; 9472 else if (isN1SExt || isN1ZExt) { 9473 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 9474 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 9475 if (isN1SExt && isAddSubSExt(N0, DAG)) { 9476 NewOpc = ARMISD::VMULLs; 9477 isMLA = true; 9478 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 9479 NewOpc = ARMISD::VMULLu; 9480 isMLA = true; 9481 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 9482 std::swap(N0, N1); 9483 NewOpc = ARMISD::VMULLu; 9484 isMLA = true; 9485 } 9486 } 9487 9488 if (!NewOpc) { 9489 if (VT == MVT::v2i64) 9490 // Fall through to expand this. It is not legal. 9491 return SDValue(); 9492 else 9493 // Other vector multiplications are legal. 9494 return Op; 9495 } 9496 } 9497 9498 // Legalize to a VMULL instruction. 
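  // For example, (v8i16 mul (sext v8i8:%a), (sext v8i8:%b)) becomes a single
  // (ARMISD::VMULLs %a, %b) once the extensions are skipped; the isMLA case
  // below additionally distributes the multiply over an add/sub.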
9499 SDLoc DL(Op); 9500 SDValue Op0; 9501 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 9502 if (!isMLA) { 9503 Op0 = SkipExtensionForVMULL(N0, DAG); 9504 assert(Op0.getValueType().is64BitVector() && 9505 Op1.getValueType().is64BitVector() && 9506 "unexpected types for extended operands to VMULL"); 9507 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 9508 } 9509 9510 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 9511 // isel lowering to take advantage of no-stall back to back vmul + vmla. 9512 // vmull q0, d4, d6 9513 // vmlal q0, d5, d6 9514 // is faster than 9515 // vaddl q0, d4, d5 9516 // vmovl q1, d6 9517 // vmul q0, q0, q1 9518 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 9519 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 9520 EVT Op1VT = Op1.getValueType(); 9521 return DAG.getNode(N0->getOpcode(), DL, VT, 9522 DAG.getNode(NewOpc, DL, VT, 9523 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 9524 DAG.getNode(NewOpc, DL, VT, 9525 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 9526 } 9527 9528 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 9529 SelectionDAG &DAG) { 9530 // TODO: Should this propagate fast-math-flags? 9531 9532 // Convert to float 9533 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 9534 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 9535 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 9536 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 9537 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 9538 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 9539 // Get reciprocal estimate. 9540 // float4 recip = vrecpeq_f32(yf); 9541 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9542 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9543 Y); 9544 // Because char has a smaller range than uchar, we can actually get away 9545 // without any newton steps. This requires that we use a weird bias 9546 // of 0xb000, however (again, this has been exhaustively tested). 9547 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 9548 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 9549 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 9550 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 9551 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 9552 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 9553 // Convert back to short. 9554 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 9555 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 9556 return X; 9557 } 9558 9559 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 9560 SelectionDAG &DAG) { 9561 // TODO: Should this propagate fast-math-flags? 9562 9563 SDValue N2; 9564 // Convert to float. 9565 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 9566 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 9567 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 9568 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 9569 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9570 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9571 9572 // Use reciprocal estimate and one refinement step. 
9573 // float4 recip = vrecpeq_f32(yf); 9574 // recip *= vrecpsq_f32(yf, recip); 9575 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9576 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9577 N1); 9578 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9579 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9580 N1, N2); 9581 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9582 // Because short has a smaller range than ushort, we can actually get away 9583 // with only a single newton step. This requires that we use a weird bias 9584 // of 89, however (again, this has been exhaustively tested). 9585 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 9586 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9587 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9588 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 9589 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9590 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9591 // Convert back to integer and return. 9592 // return vmovn_s32(vcvt_s32_f32(result)); 9593 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9594 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9595 return N0; 9596 } 9597 9598 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 9599 const ARMSubtarget *ST) { 9600 EVT VT = Op.getValueType(); 9601 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9602 "unexpected type for custom-lowering ISD::SDIV"); 9603 9604 SDLoc dl(Op); 9605 SDValue N0 = Op.getOperand(0); 9606 SDValue N1 = Op.getOperand(1); 9607 SDValue N2, N3; 9608 9609 if (VT == MVT::v8i8) { 9610 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 9611 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 9612 9613 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9614 DAG.getIntPtrConstant(4, dl)); 9615 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9616 DAG.getIntPtrConstant(4, dl)); 9617 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9618 DAG.getIntPtrConstant(0, dl)); 9619 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9620 DAG.getIntPtrConstant(0, dl)); 9621 9622 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 9623 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 9624 9625 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9626 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9627 9628 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 9629 return N0; 9630 } 9631 return LowerSDIV_v4i16(N0, N1, dl, DAG); 9632 } 9633 9634 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 9635 const ARMSubtarget *ST) { 9636 // TODO: Should this propagate fast-math-flags? 
9637 EVT VT = Op.getValueType(); 9638 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9639 "unexpected type for custom-lowering ISD::UDIV"); 9640 9641 SDLoc dl(Op); 9642 SDValue N0 = Op.getOperand(0); 9643 SDValue N1 = Op.getOperand(1); 9644 SDValue N2, N3; 9645 9646 if (VT == MVT::v8i8) { 9647 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 9648 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 9649 9650 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9651 DAG.getIntPtrConstant(4, dl)); 9652 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9653 DAG.getIntPtrConstant(4, dl)); 9654 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9655 DAG.getIntPtrConstant(0, dl)); 9656 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9657 DAG.getIntPtrConstant(0, dl)); 9658 9659 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 9660 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 9661 9662 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9663 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9664 9665 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 9666 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 9667 MVT::i32), 9668 N0); 9669 return N0; 9670 } 9671 9672 // v4i16 sdiv ... Convert to float. 9673 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 9674 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 9675 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 9676 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 9677 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9678 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9679 9680 // Use reciprocal estimate and two refinement steps. 9681 // float4 recip = vrecpeq_f32(yf); 9682 // recip *= vrecpsq_f32(yf, recip); 9683 // recip *= vrecpsq_f32(yf, recip); 9684 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9685 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9686 BN1); 9687 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9688 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9689 BN1, N2); 9690 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9691 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9692 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9693 BN1, N2); 9694 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9695 // Simply multiplying by the reciprocal estimate can leave us a few ulps 9696 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 9697 // and that it will never cause us to return an answer too large). 9698 // float4 result = as_float4(as_int4(xf*recip) + 2); 9699 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9700 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9701 N1 = DAG.getConstant(2, dl, MVT::v4i32); 9702 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9703 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9704 // Convert back to integer and return. 9705 // return vmovn_u32(vcvt_s32_f32(result)); 9706 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9707 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9708 return N0; 9709 } 9710 9711 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 9712 SDNode *N = Op.getNode(); 9713 EVT VT = N->getValueType(0); 9714 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9715 9716 SDValue Carry = Op.getOperand(2); 9717 9718 SDLoc DL(Op); 9719 9720 SDValue Result; 9721 if (Op.getOpcode() == ISD::ADDCARRY) { 9722 // This converts the boolean value carry into the carry flag. 
9723 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9724 9725 // Do the addition proper using the carry flag we wanted. 9726 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 9727 Op.getOperand(1), Carry); 9728 9729 // Now convert the carry flag into a boolean value. 9730 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9731 } else { 9732 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 9733 // have to invert the carry first. 9734 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9735 DAG.getConstant(1, DL, MVT::i32), Carry); 9736 // This converts the boolean value carry into the carry flag. 9737 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9738 9739 // Do the subtraction proper using the carry flag we wanted. 9740 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 9741 Op.getOperand(1), Carry); 9742 9743 // Now convert the carry flag into a boolean value. 9744 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9745 // But the carry returned by ARMISD::SUBE is not a borrow as expected 9746 // by ISD::SUBCARRY, so compute 1 - C. 9747 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9748 DAG.getConstant(1, DL, MVT::i32), Carry); 9749 } 9750 9751 // Return both values. 9752 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 9753 } 9754 9755 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 9756 assert(Subtarget->isTargetDarwin()); 9757 9758 // For iOS, we want to call an alternative entry point: __sincos_stret, 9759 // return values are passed via sret. 9760 SDLoc dl(Op); 9761 SDValue Arg = Op.getOperand(0); 9762 EVT ArgVT = Arg.getValueType(); 9763 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9764 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9765 9766 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9767 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9768 9769 // Pair of floats / doubles used to pass the result. 9770 Type *RetTy = StructType::get(ArgTy, ArgTy); 9771 auto &DL = DAG.getDataLayout(); 9772 9773 ArgListTy Args; 9774 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 9775 SDValue SRet; 9776 if (ShouldUseSRet) { 9777 // Create stack object for sret. 9778 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 9779 const Align StackAlign = DL.getPrefTypeAlign(RetTy); 9780 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 9781 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 9782 9783 ArgListEntry Entry; 9784 Entry.Node = SRet; 9785 Entry.Ty = RetTy->getPointerTo(); 9786 Entry.IsSExt = false; 9787 Entry.IsZExt = false; 9788 Entry.IsSRet = true; 9789 Args.push_back(Entry); 9790 RetTy = Type::getVoidTy(*DAG.getContext()); 9791 } 9792 9793 ArgListEntry Entry; 9794 Entry.Node = Arg; 9795 Entry.Ty = ArgTy; 9796 Entry.IsSExt = false; 9797 Entry.IsZExt = false; 9798 Args.push_back(Entry); 9799 9800 RTLIB::Libcall LC = 9801 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 9802 const char *LibcallName = getLibcallName(LC); 9803 CallingConv::ID CC = getLibcallCallingConv(LC); 9804 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 9805 9806 TargetLowering::CallLoweringInfo CLI(DAG); 9807 CLI.setDebugLoc(dl) 9808 .setChain(DAG.getEntryNode()) 9809 .setCallee(CC, RetTy, Callee, std::move(Args)) 9810 .setDiscardResult(ShouldUseSRet); 9811 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 9812 9813 if (!ShouldUseSRet) 9814 return CallResult.first; 9815 9816 SDValue LoadSin = 9817 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 9818 9819 // Address of cos field. 9820 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 9821 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 9822 SDValue LoadCos = 9823 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 9824 9825 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 9826 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 9827 LoadSin.getValue(0), LoadCos.getValue(0)); 9828 } 9829 9830 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 9831 bool Signed, 9832 SDValue &Chain) const { 9833 EVT VT = Op.getValueType(); 9834 assert((VT == MVT::i32 || VT == MVT::i64) && 9835 "unexpected type for custom lowering DIV"); 9836 SDLoc dl(Op); 9837 9838 const auto &DL = DAG.getDataLayout(); 9839 const auto &TLI = DAG.getTargetLoweringInfo(); 9840 9841 const char *Name = nullptr; 9842 if (Signed) 9843 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 9844 else 9845 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 9846 9847 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 9848 9849 ARMTargetLowering::ArgListTy Args; 9850 9851 for (auto AI : {1, 0}) { 9852 ArgListEntry Arg; 9853 Arg.Node = Op.getOperand(AI); 9854 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 9855 Args.push_back(Arg); 9856 } 9857 9858 CallLoweringInfo CLI(DAG); 9859 CLI.setDebugLoc(dl) 9860 .setChain(Chain) 9861 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 9862 ES, std::move(Args)); 9863 9864 return LowerCallTo(CLI).first; 9865 } 9866 9867 // This is a code size optimisation: return the original SDIV node to 9868 // DAGCombiner when we don't want to expand SDIV into a sequence of 9869 // instructions, and an empty node otherwise which will cause the 9870 // SDIV to be expanded in DAGCombine. 9871 SDValue 9872 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 9873 SelectionDAG &DAG, 9874 SmallVectorImpl<SDNode *> &Created) const { 9875 // TODO: Support SREM 9876 if (N->getOpcode() != ISD::SDIV) 9877 return SDValue(); 9878 9879 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 9880 const bool MinSize = ST.hasMinSize(); 9881 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 9882 : ST.hasDivideInARMMode(); 9883 9884 // Don't touch vector types; rewriting this may lead to scalarizing 9885 // the int divs. 9886 if (N->getOperand(0).getValueType().isVector()) 9887 return SDValue(); 9888 9889 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9890 // hwdiv support for this to be really profitable. 9891 if (!(MinSize && HasDivide)) 9892 return SDValue(); 9893 9894 // ARM mode is a bit simpler than Thumb: we can handle large power 9895 // of 2 immediates with 1 mov instruction; no further checks required, 9896 // just return the sdiv node. 
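  // For example, under minsize with hardware divide, (sdiv i32 %x, 16) is left
  // as an SDIV node here, while in Thumb mode (sdiv i32 %x, 256) returns an
  // empty SDValue so that DAGCombine expands it instead.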
  if (!ST.isThumb())
    return SDValue(N, 0);

  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
  // and thus lose the code size benefits of a MOVS that requires only 2.
  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
  // but as it's doing exactly this, it's not worth the trouble to get TTI.
  if (Divisor.sgt(128))
    return SDValue();

  return SDValue(N, 0);
}

SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                               DAG.getEntryNode(), Op.getOperand(1));

  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
}

static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(1, DL, MVT::i32));
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}

void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}

static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected an unindexed load");

  // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4/2 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
  // for BE).
  // Speaking of BE, apparently the rest of llvm will assume a reverse order
  // from a natural VMSR(load), so the value needs to be reversed.
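  // For example, a little-endian v4i1 load becomes an extending i32 load of
  // the 4 stored bits, an ARMISD::PREDICATE_CAST of that value to v16i1, and
  // an EXTRACT_SUBVECTOR back down to v4i1 (big-endian additionally
  // bit-reverses and shifts the loaded value first).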

  SDLoc dl(Op);
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Val = Load;
  if (DAG.getDataLayout().isBigEndian())
    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}

void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  assert(LD->isUnindexed() && "Loads should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile()) {
    SDLoc dl(N);
    SDValue Result = DAG.getMemIntrinsicNode(
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  }
}

static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
  assert(ST->isUnindexed() && "Expected an unindexed store");

  // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
  // top bits unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
      unsigned Elt = DAG.getDataLayout().isBigEndian() ?
MemVT.getVectorNumElements() - I - 1 10033 : I; 10034 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 10035 DAG.getConstant(Elt, dl, MVT::i32))); 10036 } 10037 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 10038 Ops.push_back(DAG.getUNDEF(MVT::i32)); 10039 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 10040 } 10041 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 10042 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) 10043 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, 10044 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), 10045 DAG.getConstant(16, dl, MVT::i32)); 10046 return DAG.getTruncStore( 10047 ST->getChain(), dl, GRP, ST->getBasePtr(), 10048 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 10049 ST->getMemOperand()); 10050 } 10051 10052 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, 10053 const ARMSubtarget *Subtarget) { 10054 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 10055 EVT MemVT = ST->getMemoryVT(); 10056 assert(ST->isUnindexed() && "Stores should be unindexed at this point."); 10057 10058 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 10059 !Subtarget->isThumb1Only() && ST->isVolatile()) { 10060 SDNode *N = Op.getNode(); 10061 SDLoc dl(N); 10062 10063 SDValue Lo = DAG.getNode( 10064 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 10065 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, 10066 MVT::i32)); 10067 SDValue Hi = DAG.getNode( 10068 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 10069 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl, 10070 MVT::i32)); 10071 10072 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), 10073 {ST->getChain(), Lo, Hi, ST->getBasePtr()}, 10074 MemVT, ST->getMemOperand()); 10075 } else if (Subtarget->hasMVEIntegerOps() && 10076 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 10077 MemVT == MVT::v16i1))) { 10078 return LowerPredicateStore(Op, DAG); 10079 } 10080 10081 return SDValue(); 10082 } 10083 10084 static bool isZeroVector(SDValue N) { 10085 return (ISD::isBuildVectorAllZeros(N.getNode()) || 10086 (N->getOpcode() == ARMISD::VMOVIMM && 10087 isNullConstant(N->getOperand(0)))); 10088 } 10089 10090 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 10091 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 10092 MVT VT = Op.getSimpleValueType(); 10093 SDValue Mask = N->getMask(); 10094 SDValue PassThru = N->getPassThru(); 10095 SDLoc dl(Op); 10096 10097 if (isZeroVector(PassThru)) 10098 return Op; 10099 10100 // MVE Masked loads use zero as the passthru value. Here we convert undef to 10101 // zero too, and other values are lowered to a select. 
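  // For example, a masked load with a non-zero, non-undef passthru %p becomes
  // (vselect %mask, (masked_load ..., zeroinitializer), %p); an undef or
  // (cast of a) zero passthru just uses the zeroing load directly.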
10102 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 10103 DAG.getTargetConstant(0, dl, MVT::i32)); 10104 SDValue NewLoad = DAG.getMaskedLoad( 10105 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, 10106 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), 10107 N->getExtensionType(), N->isExpandingLoad()); 10108 SDValue Combo = NewLoad; 10109 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || 10110 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && 10111 isZeroVector(PassThru->getOperand(0)); 10112 if (!PassThru.isUndef() && !PassThruIsCastZero) 10113 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 10114 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 10115 } 10116 10117 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, 10118 const ARMSubtarget *ST) { 10119 if (!ST->hasMVEIntegerOps()) 10120 return SDValue(); 10121 10122 SDLoc dl(Op); 10123 unsigned BaseOpcode = 0; 10124 switch (Op->getOpcode()) { 10125 default: llvm_unreachable("Expected VECREDUCE opcode"); 10126 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; 10127 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; 10128 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; 10129 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; 10130 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; 10131 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; 10132 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; 10133 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; 10134 } 10135 10136 SDValue Op0 = Op->getOperand(0); 10137 EVT VT = Op0.getValueType(); 10138 EVT EltVT = VT.getVectorElementType(); 10139 unsigned NumElts = VT.getVectorNumElements(); 10140 unsigned NumActiveLanes = NumElts; 10141 10142 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || 10143 NumActiveLanes == 2) && 10144 "Only expected a power 2 vector size"); 10145 10146 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements 10147 // allows us to easily extract vector elements from the lanes. 10148 while (NumActiveLanes > 4) { 10149 unsigned RevOpcode = NumActiveLanes == 16 ? 
ARMISD::VREV16 : ARMISD::VREV32; 10150 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); 10151 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); 10152 NumActiveLanes /= 2; 10153 } 10154 10155 SDValue Res; 10156 if (NumActiveLanes == 4) { 10157 // The remaining 4 elements are summed sequentially 10158 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10159 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); 10160 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10161 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); 10162 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10163 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); 10164 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10165 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); 10166 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 10167 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); 10168 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); 10169 } else { 10170 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10171 DAG.getConstant(0, dl, MVT::i32)); 10172 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10173 DAG.getConstant(1, dl, MVT::i32)); 10174 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 10175 } 10176 10177 // Result type may be wider than element type. 10178 if (EltVT != Op->getValueType(0)) 10179 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); 10180 return Res; 10181 } 10182 10183 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, 10184 const ARMSubtarget *ST) { 10185 if (!ST->hasMVEFloatOps()) 10186 return SDValue(); 10187 return LowerVecReduce(Op, DAG, ST); 10188 } 10189 10190 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 10191 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) 10192 // Acquire/Release load/store is not legal for targets without a dmb or 10193 // equivalent available. 10194 return SDValue(); 10195 10196 // Monotonic load/store is legal for all targets. 
10197 return Op; 10198 } 10199 10200 static void ReplaceREADCYCLECOUNTER(SDNode *N, 10201 SmallVectorImpl<SDValue> &Results, 10202 SelectionDAG &DAG, 10203 const ARMSubtarget *Subtarget) { 10204 SDLoc DL(N); 10205 // Under Power Management extensions, the cycle-count is: 10206 // mrc p15, #0, <Rt>, c9, c13, #0 10207 SDValue Ops[] = { N->getOperand(0), // Chain 10208 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 10209 DAG.getTargetConstant(15, DL, MVT::i32), 10210 DAG.getTargetConstant(0, DL, MVT::i32), 10211 DAG.getTargetConstant(9, DL, MVT::i32), 10212 DAG.getTargetConstant(13, DL, MVT::i32), 10213 DAG.getTargetConstant(0, DL, MVT::i32) 10214 }; 10215 10216 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 10217 DAG.getVTList(MVT::i32, MVT::Other), Ops); 10218 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 10219 DAG.getConstant(0, DL, MVT::i32))); 10220 Results.push_back(Cycles32.getValue(1)); 10221 } 10222 10223 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 10224 SDLoc dl(V.getNode()); 10225 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 10226 SDValue VHi = DAG.getAnyExtOrTrunc( 10227 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 10228 dl, MVT::i32); 10229 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10230 if (isBigEndian) 10231 std::swap (VLo, VHi); 10232 SDValue RegClass = 10233 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 10234 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 10235 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 10236 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 10237 return SDValue( 10238 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 10239 } 10240 10241 static void ReplaceCMP_SWAP_64Results(SDNode *N, 10242 SmallVectorImpl<SDValue> & Results, 10243 SelectionDAG &DAG) { 10244 assert(N->getValueType(0) == MVT::i64 && 10245 "AtomicCmpSwap on types less than 64 should be legal"); 10246 SDValue Ops[] = {N->getOperand(1), 10247 createGPRPairNode(DAG, N->getOperand(2)), 10248 createGPRPairNode(DAG, N->getOperand(3)), 10249 N->getOperand(0)}; 10250 SDNode *CmpSwap = DAG.getMachineNode( 10251 ARM::CMP_SWAP_64, SDLoc(N), 10252 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 10253 10254 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 10255 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 10256 10257 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10258 10259 SDValue Lo = 10260 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 10261 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10262 SDValue Hi = 10263 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 10264 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10265 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); 10266 Results.push_back(SDValue(CmpSwap, 2)); 10267 } 10268 10269 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 10270 SDLoc dl(Op); 10271 EVT VT = Op.getValueType(); 10272 SDValue Chain = Op.getOperand(0); 10273 SDValue LHS = Op.getOperand(1); 10274 SDValue RHS = Op.getOperand(2); 10275 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 10276 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 10277 10278 // If we don't have instructions of this float type then soften to a libcall 10279 // and use SETCC instead. 
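  // For example, an f32 comparison on a target without the relevant VFP
  // instructions becomes a comparison libcall (e.g. __aeabi_fcmplt for SETOLT,
  // depending on the configured runtime), and the libcall's integer result is
  // then re-tested with an ordinary integer SETCC below.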
10280 if (isUnsupportedFloatingType(LHS.getValueType())) { 10281 DAG.getTargetLoweringInfo().softenSetCCOperands( 10282 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 10283 if (!RHS.getNode()) { 10284 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 10285 CC = ISD::SETNE; 10286 } 10287 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 10288 DAG.getCondCode(CC)); 10289 return DAG.getMergeValues({Result, Chain}, dl); 10290 } 10291 10292 ARMCC::CondCodes CondCode, CondCode2; 10293 FPCCToARMCC(CC, CondCode, CondCode2); 10294 10295 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 10296 // in CMPFP and CMPFPE, but instead it should be made explicit by these 10297 // instructions using a chain instead of glue. This would also fix the problem 10298 // here (and also in LowerSELECT_CC) where we generate two comparisons when 10299 // CondCode2 != AL. 10300 SDValue True = DAG.getConstant(1, dl, VT); 10301 SDValue False = DAG.getConstant(0, dl, VT); 10302 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 10303 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 10304 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10305 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 10306 if (CondCode2 != ARMCC::AL) { 10307 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 10308 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10309 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 10310 } 10311 return DAG.getMergeValues({Result, Chain}, dl); 10312 } 10313 10314 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10315 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 10316 switch (Op.getOpcode()) { 10317 default: llvm_unreachable("Don't know how to custom lower this!"); 10318 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 10319 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10320 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10321 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10322 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10323 case ISD::SELECT: return LowerSELECT(Op, DAG); 10324 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 10325 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10326 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 10327 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 10328 case ISD::VASTART: return LowerVASTART(Op, DAG); 10329 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 10330 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 10331 case ISD::SINT_TO_FP: 10332 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 10333 case ISD::STRICT_FP_TO_SINT: 10334 case ISD::STRICT_FP_TO_UINT: 10335 case ISD::FP_TO_SINT: 10336 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 10337 case ISD::FP_TO_SINT_SAT: 10338 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget); 10339 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10340 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10341 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10342 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 10343 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 10344 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 10345 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 10346 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 10347 
Subtarget); 10348 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 10349 case ISD::SHL: 10350 case ISD::SRL: 10351 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 10352 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 10353 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 10354 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 10355 case ISD::SRL_PARTS: 10356 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 10357 case ISD::CTTZ: 10358 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 10359 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 10360 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 10361 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 10362 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 10363 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 10364 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 10365 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 10366 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10367 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 10368 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 10369 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget); 10370 case ISD::SIGN_EXTEND: 10371 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget); 10372 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10373 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); 10374 case ISD::MUL: return LowerMUL(Op, DAG); 10375 case ISD::SDIV: 10376 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10377 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 10378 return LowerSDIV(Op, DAG, Subtarget); 10379 case ISD::UDIV: 10380 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10381 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 10382 return LowerUDIV(Op, DAG, Subtarget); 10383 case ISD::ADDCARRY: 10384 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 10385 case ISD::SADDO: 10386 case ISD::SSUBO: 10387 return LowerSignedALUO(Op, DAG); 10388 case ISD::UADDO: 10389 case ISD::USUBO: 10390 return LowerUnsignedALUO(Op, DAG); 10391 case ISD::SADDSAT: 10392 case ISD::SSUBSAT: 10393 case ISD::UADDSAT: 10394 case ISD::USUBSAT: 10395 return LowerADDSUBSAT(Op, DAG, Subtarget); 10396 case ISD::LOAD: 10397 return LowerPredicateLoad(Op, DAG); 10398 case ISD::STORE: 10399 return LowerSTORE(Op, DAG, Subtarget); 10400 case ISD::MLOAD: 10401 return LowerMLOAD(Op, DAG); 10402 case ISD::VECREDUCE_MUL: 10403 case ISD::VECREDUCE_AND: 10404 case ISD::VECREDUCE_OR: 10405 case ISD::VECREDUCE_XOR: 10406 return LowerVecReduce(Op, DAG, Subtarget); 10407 case ISD::VECREDUCE_FADD: 10408 case ISD::VECREDUCE_FMUL: 10409 case ISD::VECREDUCE_FMIN: 10410 case ISD::VECREDUCE_FMAX: 10411 return LowerVecReduceF(Op, DAG, Subtarget); 10412 case ISD::ATOMIC_LOAD: 10413 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 10414 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 10415 case ISD::SDIVREM: 10416 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 10417 case ISD::DYNAMIC_STACKALLOC: 10418 if (Subtarget->isTargetWindows()) 10419 return LowerDYNAMIC_STACKALLOC(Op, DAG); 10420 llvm_unreachable("Don't know how to custom lower this!"); 10421 case ISD::STRICT_FP_ROUND: 10422 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 10423 
case ISD::STRICT_FP_EXTEND: 10424 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 10425 case ISD::STRICT_FSETCC: 10426 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 10427 case ARMISD::WIN__DBZCHK: return SDValue(); 10428 } 10429 } 10430 10431 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 10432 SelectionDAG &DAG) { 10433 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 10434 unsigned Opc = 0; 10435 if (IntNo == Intrinsic::arm_smlald) 10436 Opc = ARMISD::SMLALD; 10437 else if (IntNo == Intrinsic::arm_smlaldx) 10438 Opc = ARMISD::SMLALDX; 10439 else if (IntNo == Intrinsic::arm_smlsld) 10440 Opc = ARMISD::SMLSLD; 10441 else if (IntNo == Intrinsic::arm_smlsldx) 10442 Opc = ARMISD::SMLSLDX; 10443 else 10444 return; 10445 10446 SDLoc dl(N); 10447 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10448 N->getOperand(3), 10449 DAG.getConstant(0, dl, MVT::i32)); 10450 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10451 N->getOperand(3), 10452 DAG.getConstant(1, dl, MVT::i32)); 10453 10454 SDValue LongMul = DAG.getNode(Opc, dl, 10455 DAG.getVTList(MVT::i32, MVT::i32), 10456 N->getOperand(1), N->getOperand(2), 10457 Lo, Hi); 10458 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 10459 LongMul.getValue(0), LongMul.getValue(1))); 10460 } 10461 10462 /// ReplaceNodeResults - Replace the results of node with an illegal result 10463 /// type with new values built out of custom code. 10464 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 10465 SmallVectorImpl<SDValue> &Results, 10466 SelectionDAG &DAG) const { 10467 SDValue Res; 10468 switch (N->getOpcode()) { 10469 default: 10470 llvm_unreachable("Don't know how to custom expand this!"); 10471 case ISD::READ_REGISTER: 10472 ExpandREAD_REGISTER(N, Results, DAG); 10473 break; 10474 case ISD::BITCAST: 10475 Res = ExpandBITCAST(N, DAG, Subtarget); 10476 break; 10477 case ISD::SRL: 10478 case ISD::SRA: 10479 case ISD::SHL: 10480 Res = Expand64BitShift(N, DAG, Subtarget); 10481 break; 10482 case ISD::SREM: 10483 case ISD::UREM: 10484 Res = LowerREM(N, DAG); 10485 break; 10486 case ISD::SDIVREM: 10487 case ISD::UDIVREM: 10488 Res = LowerDivRem(SDValue(N, 0), DAG); 10489 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 10490 Results.push_back(Res.getValue(0)); 10491 Results.push_back(Res.getValue(1)); 10492 return; 10493 case ISD::SADDSAT: 10494 case ISD::SSUBSAT: 10495 case ISD::UADDSAT: 10496 case ISD::USUBSAT: 10497 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 10498 break; 10499 case ISD::READCYCLECOUNTER: 10500 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 10501 return; 10502 case ISD::UDIV: 10503 case ISD::SDIV: 10504 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 10505 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 10506 Results); 10507 case ISD::ATOMIC_CMP_SWAP: 10508 ReplaceCMP_SWAP_64Results(N, Results, DAG); 10509 return; 10510 case ISD::INTRINSIC_WO_CHAIN: 10511 return ReplaceLongIntrinsic(N, Results, DAG); 10512 case ISD::ABS: 10513 lowerABS(N, Results, DAG); 10514 return ; 10515 case ISD::LOAD: 10516 LowerLOAD(N, Results, DAG); 10517 break; 10518 case ISD::TRUNCATE: 10519 Res = LowerTruncate(N, DAG, Subtarget); 10520 break; 10521 case ISD::SIGN_EXTEND: 10522 case ISD::ZERO_EXTEND: 10523 Res = LowerVectorExtend(N, DAG, Subtarget); 10524 break; 10525 case ISD::FP_TO_SINT_SAT: 10526 case ISD::FP_TO_UINT_SAT: 10527 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, 
Subtarget); 10528 break; 10529 } 10530 if (Res.getNode()) 10531 Results.push_back(Res); 10532 } 10533 10534 //===----------------------------------------------------------------------===// 10535 // ARM Scheduler Hooks 10536 //===----------------------------------------------------------------------===// 10537 10538 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 10539 /// registers the function context. 10540 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 10541 MachineBasicBlock *MBB, 10542 MachineBasicBlock *DispatchBB, 10543 int FI) const { 10544 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 10545 "ROPI/RWPI not currently supported with SjLj"); 10546 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10547 DebugLoc dl = MI.getDebugLoc(); 10548 MachineFunction *MF = MBB->getParent(); 10549 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10550 MachineConstantPool *MCP = MF->getConstantPool(); 10551 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 10552 const Function &F = MF->getFunction(); 10553 10554 bool isThumb = Subtarget->isThumb(); 10555 bool isThumb2 = Subtarget->isThumb2(); 10556 10557 unsigned PCLabelId = AFI->createPICLabelUId(); 10558 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 10559 ARMConstantPoolValue *CPV = 10560 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 10561 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); 10562 10563 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 10564 : &ARM::GPRRegClass; 10565 10566 // Grab constant pool and fixed stack memory operands. 10567 MachineMemOperand *CPMMO = 10568 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10569 MachineMemOperand::MOLoad, 4, Align(4)); 10570 10571 MachineMemOperand *FIMMOSt = 10572 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 10573 MachineMemOperand::MOStore, 4, Align(4)); 10574 10575 // Load the address of the dispatch MBB into the jump buffer. 10576 if (isThumb2) { 10577 // Incoming value: jbuf 10578 // ldr.n r5, LCPI1_1 10579 // orr r5, r5, #1 10580 // add r5, pc 10581 // str r5, [$jbuf, #+4] ; &jbuf[1] 10582 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10583 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 10584 .addConstantPoolIndex(CPI) 10585 .addMemOperand(CPMMO) 10586 .add(predOps(ARMCC::AL)); 10587 // Set the low bit because of thumb mode. 
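    // (Thumb code addresses must have bit 0 set; otherwise the later indirect
    // jump through the jump buffer would leave Thumb state.)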
10588 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10589 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 10590 .addReg(NewVReg1, RegState::Kill) 10591 .addImm(0x01) 10592 .add(predOps(ARMCC::AL)) 10593 .add(condCodeOp()); 10594 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10595 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 10596 .addReg(NewVReg2, RegState::Kill) 10597 .addImm(PCLabelId); 10598 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 10599 .addReg(NewVReg3, RegState::Kill) 10600 .addFrameIndex(FI) 10601 .addImm(36) // &jbuf[1] :: pc 10602 .addMemOperand(FIMMOSt) 10603 .add(predOps(ARMCC::AL)); 10604 } else if (isThumb) { 10605 // Incoming value: jbuf 10606 // ldr.n r1, LCPI1_4 10607 // add r1, pc 10608 // mov r2, #1 10609 // orrs r1, r2 10610 // add r2, $jbuf, #+4 ; &jbuf[1] 10611 // str r1, [r2] 10612 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10613 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 10614 .addConstantPoolIndex(CPI) 10615 .addMemOperand(CPMMO) 10616 .add(predOps(ARMCC::AL)); 10617 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10618 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 10619 .addReg(NewVReg1, RegState::Kill) 10620 .addImm(PCLabelId); 10621 // Set the low bit because of thumb mode. 10622 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10623 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 10624 .addReg(ARM::CPSR, RegState::Define) 10625 .addImm(1) 10626 .add(predOps(ARMCC::AL)); 10627 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10628 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 10629 .addReg(ARM::CPSR, RegState::Define) 10630 .addReg(NewVReg2, RegState::Kill) 10631 .addReg(NewVReg3, RegState::Kill) 10632 .add(predOps(ARMCC::AL)); 10633 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10634 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 10635 .addFrameIndex(FI) 10636 .addImm(36); // &jbuf[1] :: pc 10637 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 10638 .addReg(NewVReg4, RegState::Kill) 10639 .addReg(NewVReg5, RegState::Kill) 10640 .addImm(0) 10641 .addMemOperand(FIMMOSt) 10642 .add(predOps(ARMCC::AL)); 10643 } else { 10644 // Incoming value: jbuf 10645 // ldr r1, LCPI1_1 10646 // add r1, pc, r1 10647 // str r1, [$jbuf, #+4] ; &jbuf[1] 10648 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10649 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 10650 .addConstantPoolIndex(CPI) 10651 .addImm(0) 10652 .addMemOperand(CPMMO) 10653 .add(predOps(ARMCC::AL)); 10654 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10655 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 10656 .addReg(NewVReg1, RegState::Kill) 10657 .addImm(PCLabelId) 10658 .add(predOps(ARMCC::AL)); 10659 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 10660 .addReg(NewVReg2, RegState::Kill) 10661 .addFrameIndex(FI) 10662 .addImm(36) // &jbuf[1] :: pc 10663 .addMemOperand(FIMMOSt) 10664 .add(predOps(ARMCC::AL)); 10665 } 10666 } 10667 10668 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 10669 MachineBasicBlock *MBB) const { 10670 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10671 DebugLoc dl = MI.getDebugLoc(); 10672 MachineFunction *MF = MBB->getParent(); 10673 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10674 MachineFrameInfo &MFI = MF->getFrameInfo(); 10675 int FI = MFI.getFunctionContextIndex(); 10676 10677 const TargetRegisterClass *TRC = Subtarget->isThumb() ? 
&ARM::tGPRRegClass 10678 : &ARM::GPRnopcRegClass; 10679 10680 // Get a mapping of the call site numbers to all of the landing pads they're 10681 // associated with. 10682 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 10683 unsigned MaxCSNum = 0; 10684 for (MachineBasicBlock &BB : *MF) { 10685 if (!BB.isEHPad()) 10686 continue; 10687 10688 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 10689 // pad. 10690 for (MachineInstr &II : BB) { 10691 if (!II.isEHLabel()) 10692 continue; 10693 10694 MCSymbol *Sym = II.getOperand(0).getMCSymbol(); 10695 if (!MF->hasCallSiteLandingPad(Sym)) continue; 10696 10697 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 10698 for (unsigned Idx : CallSiteIdxs) { 10699 CallSiteNumToLPad[Idx].push_back(&BB); 10700 MaxCSNum = std::max(MaxCSNum, Idx); 10701 } 10702 break; 10703 } 10704 } 10705 10706 // Get an ordered list of the machine basic blocks for the jump table. 10707 std::vector<MachineBasicBlock*> LPadList; 10708 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 10709 LPadList.reserve(CallSiteNumToLPad.size()); 10710 for (unsigned I = 1; I <= MaxCSNum; ++I) { 10711 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 10712 for (MachineBasicBlock *MBB : MBBList) { 10713 LPadList.push_back(MBB); 10714 InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end()); 10715 } 10716 } 10717 10718 assert(!LPadList.empty() && 10719 "No landing pad destinations for the dispatch jump table!"); 10720 10721 // Create the jump table and associated information. 10722 MachineJumpTableInfo *JTI = 10723 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 10724 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 10725 10726 // Create the MBBs for the dispatch code. 10727 10728 // Shove the dispatch's address into the return slot in the function context. 10729 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 10730 DispatchBB->setIsEHPad(); 10731 10732 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10733 unsigned trap_opcode; 10734 if (Subtarget->isThumb()) 10735 trap_opcode = ARM::tTRAP; 10736 else 10737 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 10738 10739 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 10740 DispatchBB->addSuccessor(TrapBB); 10741 10742 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 10743 DispatchBB->addSuccessor(DispContBB); 10744 10745 // Insert and MBBs. 10746 MF->insert(MF->end(), DispatchBB); 10747 MF->insert(MF->end(), DispContBB); 10748 MF->insert(MF->end(), TrapBB); 10749 10750 // Insert code into the entry block that creates and registers the function 10751 // context. 10752 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 10753 10754 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 10755 MachinePointerInfo::getFixedStack(*MF, FI), 10756 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); 10757 10758 MachineInstrBuilder MIB; 10759 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 10760 10761 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 10762 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 10763 10764 // Add a register mask with no preserved registers. This results in all 10765 // registers being marked as clobbered. 
This can't work if the dispatch block 10766 // is in a Thumb1 function and is linked with ARM code which uses the FP 10767 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 10768 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 10769 10770 bool IsPositionIndependent = isPositionIndependent(); 10771 unsigned NumLPads = LPadList.size(); 10772 if (Subtarget->isThumb2()) { 10773 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10774 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 10775 .addFrameIndex(FI) 10776 .addImm(4) 10777 .addMemOperand(FIMMOLd) 10778 .add(predOps(ARMCC::AL)); 10779 10780 if (NumLPads < 256) { 10781 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 10782 .addReg(NewVReg1) 10783 .addImm(LPadList.size()) 10784 .add(predOps(ARMCC::AL)); 10785 } else { 10786 Register VReg1 = MRI->createVirtualRegister(TRC); 10787 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 10788 .addImm(NumLPads & 0xFFFF) 10789 .add(predOps(ARMCC::AL)); 10790 10791 unsigned VReg2 = VReg1; 10792 if ((NumLPads & 0xFFFF0000) != 0) { 10793 VReg2 = MRI->createVirtualRegister(TRC); 10794 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 10795 .addReg(VReg1) 10796 .addImm(NumLPads >> 16) 10797 .add(predOps(ARMCC::AL)); 10798 } 10799 10800 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 10801 .addReg(NewVReg1) 10802 .addReg(VReg2) 10803 .add(predOps(ARMCC::AL)); 10804 } 10805 10806 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 10807 .addMBB(TrapBB) 10808 .addImm(ARMCC::HI) 10809 .addReg(ARM::CPSR); 10810 10811 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10812 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 10813 .addJumpTableIndex(MJTI) 10814 .add(predOps(ARMCC::AL)); 10815 10816 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10817 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 10818 .addReg(NewVReg3, RegState::Kill) 10819 .addReg(NewVReg1) 10820 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10821 .add(predOps(ARMCC::AL)) 10822 .add(condCodeOp()); 10823 10824 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 10825 .addReg(NewVReg4, RegState::Kill) 10826 .addReg(NewVReg1) 10827 .addJumpTableIndex(MJTI); 10828 } else if (Subtarget->isThumb()) { 10829 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10830 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 10831 .addFrameIndex(FI) 10832 .addImm(1) 10833 .addMemOperand(FIMMOLd) 10834 .add(predOps(ARMCC::AL)); 10835 10836 if (NumLPads < 256) { 10837 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 10838 .addReg(NewVReg1) 10839 .addImm(NumLPads) 10840 .add(predOps(ARMCC::AL)); 10841 } else { 10842 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10843 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10844 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10845 10846 // MachineConstantPool wants an explicit alignment. 
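      // (NumLPads does not fit in the 8-bit immediate of tCMPi8 here, so the
      // value is materialised from the constant pool and compared with tCMPr
      // below.)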
10847 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10848 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10849 10850 Register VReg1 = MRI->createVirtualRegister(TRC); 10851 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 10852 .addReg(VReg1, RegState::Define) 10853 .addConstantPoolIndex(Idx) 10854 .add(predOps(ARMCC::AL)); 10855 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 10856 .addReg(NewVReg1) 10857 .addReg(VReg1) 10858 .add(predOps(ARMCC::AL)); 10859 } 10860 10861 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 10862 .addMBB(TrapBB) 10863 .addImm(ARMCC::HI) 10864 .addReg(ARM::CPSR); 10865 10866 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10867 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 10868 .addReg(ARM::CPSR, RegState::Define) 10869 .addReg(NewVReg1) 10870 .addImm(2) 10871 .add(predOps(ARMCC::AL)); 10872 10873 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10874 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 10875 .addJumpTableIndex(MJTI) 10876 .add(predOps(ARMCC::AL)); 10877 10878 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10879 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 10880 .addReg(ARM::CPSR, RegState::Define) 10881 .addReg(NewVReg2, RegState::Kill) 10882 .addReg(NewVReg3) 10883 .add(predOps(ARMCC::AL)); 10884 10885 MachineMemOperand *JTMMOLd = 10886 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10887 MachineMemOperand::MOLoad, 4, Align(4)); 10888 10889 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10890 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 10891 .addReg(NewVReg4, RegState::Kill) 10892 .addImm(0) 10893 .addMemOperand(JTMMOLd) 10894 .add(predOps(ARMCC::AL)); 10895 10896 unsigned NewVReg6 = NewVReg5; 10897 if (IsPositionIndependent) { 10898 NewVReg6 = MRI->createVirtualRegister(TRC); 10899 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 10900 .addReg(ARM::CPSR, RegState::Define) 10901 .addReg(NewVReg5, RegState::Kill) 10902 .addReg(NewVReg3) 10903 .add(predOps(ARMCC::AL)); 10904 } 10905 10906 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 10907 .addReg(NewVReg6, RegState::Kill) 10908 .addJumpTableIndex(MJTI); 10909 } else { 10910 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10911 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 10912 .addFrameIndex(FI) 10913 .addImm(4) 10914 .addMemOperand(FIMMOLd) 10915 .add(predOps(ARMCC::AL)); 10916 10917 if (NumLPads < 256) { 10918 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 10919 .addReg(NewVReg1) 10920 .addImm(NumLPads) 10921 .add(predOps(ARMCC::AL)); 10922 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 10923 Register VReg1 = MRI->createVirtualRegister(TRC); 10924 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 10925 .addImm(NumLPads & 0xFFFF) 10926 .add(predOps(ARMCC::AL)); 10927 10928 unsigned VReg2 = VReg1; 10929 if ((NumLPads & 0xFFFF0000) != 0) { 10930 VReg2 = MRI->createVirtualRegister(TRC); 10931 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 10932 .addReg(VReg1) 10933 .addImm(NumLPads >> 16) 10934 .add(predOps(ARMCC::AL)); 10935 } 10936 10937 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10938 .addReg(NewVReg1) 10939 .addReg(VReg2) 10940 .add(predOps(ARMCC::AL)); 10941 } else { 10942 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10943 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10944 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10945 10946 // MachineConstantPool wants an explicit 
alignment. 10947 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10948 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10949 10950 Register VReg1 = MRI->createVirtualRegister(TRC); 10951 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 10952 .addReg(VReg1, RegState::Define) 10953 .addConstantPoolIndex(Idx) 10954 .addImm(0) 10955 .add(predOps(ARMCC::AL)); 10956 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10957 .addReg(NewVReg1) 10958 .addReg(VReg1, RegState::Kill) 10959 .add(predOps(ARMCC::AL)); 10960 } 10961 10962 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 10963 .addMBB(TrapBB) 10964 .addImm(ARMCC::HI) 10965 .addReg(ARM::CPSR); 10966 10967 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10968 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 10969 .addReg(NewVReg1) 10970 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10971 .add(predOps(ARMCC::AL)) 10972 .add(condCodeOp()); 10973 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10974 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 10975 .addJumpTableIndex(MJTI) 10976 .add(predOps(ARMCC::AL)); 10977 10978 MachineMemOperand *JTMMOLd = 10979 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10980 MachineMemOperand::MOLoad, 4, Align(4)); 10981 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10982 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 10983 .addReg(NewVReg3, RegState::Kill) 10984 .addReg(NewVReg4) 10985 .addImm(0) 10986 .addMemOperand(JTMMOLd) 10987 .add(predOps(ARMCC::AL)); 10988 10989 if (IsPositionIndependent) { 10990 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 10991 .addReg(NewVReg5, RegState::Kill) 10992 .addReg(NewVReg4) 10993 .addJumpTableIndex(MJTI); 10994 } else { 10995 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 10996 .addReg(NewVReg5, RegState::Kill) 10997 .addJumpTableIndex(MJTI); 10998 } 10999 } 11000 11001 // Add the jump table entries as successors to the MBB. 11002 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 11003 for (MachineBasicBlock *CurMBB : LPadList) { 11004 if (SeenMBBs.insert(CurMBB).second) 11005 DispContBB->addSuccessor(CurMBB); 11006 } 11007 11008 // N.B. the order the invoke BBs are processed in doesn't matter here. 11009 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 11010 SmallVector<MachineBasicBlock*, 64> MBBLPads; 11011 for (MachineBasicBlock *BB : InvokeBBs) { 11012 11013 // Remove the landing pad successor from the invoke block and replace it 11014 // with the new dispatch block. 11015 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); 11016 while (!Successors.empty()) { 11017 MachineBasicBlock *SMBB = Successors.pop_back_val(); 11018 if (SMBB->isEHPad()) { 11019 BB->removeSuccessor(SMBB); 11020 MBBLPads.push_back(SMBB); 11021 } 11022 } 11023 11024 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 11025 BB->normalizeSuccProbs(); 11026 11027 // Find the invoke call and mark all of the callee-saved registers as 11028 // 'implicit defined' so that they're spilled. This prevents code from 11029 // moving instructions to before the EH block, where they will never be 11030 // executed. 
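    // Concretely, the loop below finds the call and adds an implicit-def
    // (marked dead) of every callee-saved register the call does not already
    // define, so values kept in those registers are treated as clobbered at
    // the invoke and must be spilled around it.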
11031 for (MachineBasicBlock::reverse_iterator 11032 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 11033 if (!II->isCall()) continue; 11034 11035 DenseMap<unsigned, bool> DefRegs; 11036 for (MachineInstr::mop_iterator 11037 OI = II->operands_begin(), OE = II->operands_end(); 11038 OI != OE; ++OI) { 11039 if (!OI->isReg()) continue; 11040 DefRegs[OI->getReg()] = true; 11041 } 11042 11043 MachineInstrBuilder MIB(*MF, &*II); 11044 11045 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 11046 unsigned Reg = SavedRegs[i]; 11047 if (Subtarget->isThumb2() && 11048 !ARM::tGPRRegClass.contains(Reg) && 11049 !ARM::hGPRRegClass.contains(Reg)) 11050 continue; 11051 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 11052 continue; 11053 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 11054 continue; 11055 if (!DefRegs[Reg]) 11056 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 11057 } 11058 11059 break; 11060 } 11061 } 11062 11063 // Mark all former landing pads as non-landing pads. The dispatch is the only 11064 // landing pad now. 11065 for (MachineBasicBlock *MBBLPad : MBBLPads) 11066 MBBLPad->setIsEHPad(false); 11067 11068 // The instruction is gone now. 11069 MI.eraseFromParent(); 11070 } 11071 11072 static 11073 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 11074 for (MachineBasicBlock *S : MBB->successors()) 11075 if (S != Succ) 11076 return S; 11077 llvm_unreachable("Expecting a BB with two successors!"); 11078 } 11079 11080 /// Return the load opcode for a given load size. If load size >= 8, 11081 /// neon opcode will be returned. 11082 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 11083 if (LdSize >= 8) 11084 return LdSize == 16 ? ARM::VLD1q32wb_fixed 11085 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 11086 if (IsThumb1) 11087 return LdSize == 4 ? ARM::tLDRi 11088 : LdSize == 2 ? ARM::tLDRHi 11089 : LdSize == 1 ? ARM::tLDRBi : 0; 11090 if (IsThumb2) 11091 return LdSize == 4 ? ARM::t2LDR_POST 11092 : LdSize == 2 ? ARM::t2LDRH_POST 11093 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 11094 return LdSize == 4 ? ARM::LDR_POST_IMM 11095 : LdSize == 2 ? ARM::LDRH_POST 11096 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 11097 } 11098 11099 /// Return the store opcode for a given store size. If store size >= 8, 11100 /// neon opcode will be returned. 11101 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 11102 if (StSize >= 8) 11103 return StSize == 16 ? ARM::VST1q32wb_fixed 11104 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 11105 if (IsThumb1) 11106 return StSize == 4 ? ARM::tSTRi 11107 : StSize == 2 ? ARM::tSTRHi 11108 : StSize == 1 ? ARM::tSTRBi : 0; 11109 if (IsThumb2) 11110 return StSize == 4 ? ARM::t2STR_POST 11111 : StSize == 2 ? ARM::t2STRH_POST 11112 : StSize == 1 ? ARM::t2STRB_POST : 0; 11113 return StSize == 4 ? ARM::STR_POST_IMM 11114 : StSize == 2 ? ARM::STRH_POST 11115 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 11116 } 11117 11118 /// Emit a post-increment load operation with given size. The instructions 11119 /// will be added to BB at Pos. 
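/// For example, with LdSize == 4 this emits roughly "ldr Data, [AddrIn], #4"
/// (LDR_POST_IMM / t2LDR_POST), or tLDRi followed by a separate tADDi8 on
/// Thumb1, leaving the incremented address in AddrOut.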
11120 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 11121 const TargetInstrInfo *TII, const DebugLoc &dl, 11122 unsigned LdSize, unsigned Data, unsigned AddrIn, 11123 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 11124 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 11125 assert(LdOpc != 0 && "Should have a load opcode"); 11126 if (LdSize >= 8) { 11127 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11128 .addReg(AddrOut, RegState::Define) 11129 .addReg(AddrIn) 11130 .addImm(0) 11131 .add(predOps(ARMCC::AL)); 11132 } else if (IsThumb1) { 11133 // load + update AddrIn 11134 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11135 .addReg(AddrIn) 11136 .addImm(0) 11137 .add(predOps(ARMCC::AL)); 11138 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 11139 .add(t1CondCodeOp()) 11140 .addReg(AddrIn) 11141 .addImm(LdSize) 11142 .add(predOps(ARMCC::AL)); 11143 } else if (IsThumb2) { 11144 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11145 .addReg(AddrOut, RegState::Define) 11146 .addReg(AddrIn) 11147 .addImm(LdSize) 11148 .add(predOps(ARMCC::AL)); 11149 } else { // arm 11150 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11151 .addReg(AddrOut, RegState::Define) 11152 .addReg(AddrIn) 11153 .addReg(0) 11154 .addImm(LdSize) 11155 .add(predOps(ARMCC::AL)); 11156 } 11157 } 11158 11159 /// Emit a post-increment store operation with given size. The instructions 11160 /// will be added to BB at Pos. 11161 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 11162 const TargetInstrInfo *TII, const DebugLoc &dl, 11163 unsigned StSize, unsigned Data, unsigned AddrIn, 11164 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 11165 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 11166 assert(StOpc != 0 && "Should have a store opcode"); 11167 if (StSize >= 8) { 11168 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11169 .addReg(AddrIn) 11170 .addImm(0) 11171 .addReg(Data) 11172 .add(predOps(ARMCC::AL)); 11173 } else if (IsThumb1) { 11174 // store + update AddrIn 11175 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 11176 .addReg(Data) 11177 .addReg(AddrIn) 11178 .addImm(0) 11179 .add(predOps(ARMCC::AL)); 11180 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 11181 .add(t1CondCodeOp()) 11182 .addReg(AddrIn) 11183 .addImm(StSize) 11184 .add(predOps(ARMCC::AL)); 11185 } else if (IsThumb2) { 11186 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11187 .addReg(Data) 11188 .addReg(AddrIn) 11189 .addImm(StSize) 11190 .add(predOps(ARMCC::AL)); 11191 } else { // arm 11192 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11193 .addReg(Data) 11194 .addReg(AddrIn) 11195 .addReg(0) 11196 .addImm(StSize) 11197 .add(predOps(ARMCC::AL)); 11198 } 11199 } 11200 11201 MachineBasicBlock * 11202 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 11203 MachineBasicBlock *BB) const { 11204 // This pseudo instruction has 3 operands: dst, src, size 11205 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 11206 // Otherwise, we will generate unrolled scalar copies. 
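  // For example, a 17-byte copy with 4-byte alignment and no NEON, at or below
  // the inline threshold, becomes four word-sized LDR/STR post-increment pairs
  // followed by a single LDRB/STRB pair for the trailing byte.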
11207 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11208 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11209 MachineFunction::iterator It = ++BB->getIterator(); 11210 11211 Register dest = MI.getOperand(0).getReg(); 11212 Register src = MI.getOperand(1).getReg(); 11213 unsigned SizeVal = MI.getOperand(2).getImm(); 11214 unsigned Alignment = MI.getOperand(3).getImm(); 11215 DebugLoc dl = MI.getDebugLoc(); 11216 11217 MachineFunction *MF = BB->getParent(); 11218 MachineRegisterInfo &MRI = MF->getRegInfo(); 11219 unsigned UnitSize = 0; 11220 const TargetRegisterClass *TRC = nullptr; 11221 const TargetRegisterClass *VecTRC = nullptr; 11222 11223 bool IsThumb1 = Subtarget->isThumb1Only(); 11224 bool IsThumb2 = Subtarget->isThumb2(); 11225 bool IsThumb = Subtarget->isThumb(); 11226 11227 if (Alignment & 1) { 11228 UnitSize = 1; 11229 } else if (Alignment & 2) { 11230 UnitSize = 2; 11231 } else { 11232 // Check whether we can use NEON instructions. 11233 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 11234 Subtarget->hasNEON()) { 11235 if ((Alignment % 16 == 0) && SizeVal >= 16) 11236 UnitSize = 16; 11237 else if ((Alignment % 8 == 0) && SizeVal >= 8) 11238 UnitSize = 8; 11239 } 11240 // Can't use NEON instructions. 11241 if (UnitSize == 0) 11242 UnitSize = 4; 11243 } 11244 11245 // Select the correct opcode and register class for unit size load/store 11246 bool IsNeon = UnitSize >= 8; 11247 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 11248 if (IsNeon) 11249 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 11250 : UnitSize == 8 ? &ARM::DPRRegClass 11251 : nullptr; 11252 11253 unsigned BytesLeft = SizeVal % UnitSize; 11254 unsigned LoopSize = SizeVal - BytesLeft; 11255 11256 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 11257 // Use LDR and STR to copy. 11258 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 11259 // [destOut] = STR_POST(scratch, destIn, UnitSize) 11260 unsigned srcIn = src; 11261 unsigned destIn = dest; 11262 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 11263 Register srcOut = MRI.createVirtualRegister(TRC); 11264 Register destOut = MRI.createVirtualRegister(TRC); 11265 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 11266 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 11267 IsThumb1, IsThumb2); 11268 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 11269 IsThumb1, IsThumb2); 11270 srcIn = srcOut; 11271 destIn = destOut; 11272 } 11273 11274 // Handle the leftover bytes with LDRB and STRB. 11275 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 11276 // [destOut] = STRB_POST(scratch, destIn, 1) 11277 for (unsigned i = 0; i < BytesLeft; i++) { 11278 Register srcOut = MRI.createVirtualRegister(TRC); 11279 Register destOut = MRI.createVirtualRegister(TRC); 11280 Register scratch = MRI.createVirtualRegister(TRC); 11281 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 11282 IsThumb1, IsThumb2); 11283 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 11284 IsThumb1, IsThumb2); 11285 srcIn = srcOut; 11286 destIn = destOut; 11287 } 11288 MI.eraseFromParent(); // The instruction is gone now. 11289 return BB; 11290 } 11291 11292 // Expand the pseudo op to a loop. 11293 // thisMBB: 11294 // ... 
11295 // movw varEnd, # --> with thumb2 11296 // movt varEnd, # 11297 // ldrcp varEnd, idx --> without thumb2 11298 // fallthrough --> loopMBB 11299 // loopMBB: 11300 // PHI varPhi, varEnd, varLoop 11301 // PHI srcPhi, src, srcLoop 11302 // PHI destPhi, dst, destLoop 11303 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 11304 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 11305 // subs varLoop, varPhi, #UnitSize 11306 // bne loopMBB 11307 // fallthrough --> exitMBB 11308 // exitMBB: 11309 // epilogue to handle left-over bytes 11310 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 11311 // [destOut] = STRB_POST(scratch, destLoop, 1) 11312 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11313 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11314 MF->insert(It, loopMBB); 11315 MF->insert(It, exitMBB); 11316 11317 // Transfer the remainder of BB and its successor edges to exitMBB. 11318 exitMBB->splice(exitMBB->begin(), BB, 11319 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11320 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 11321 11322 // Load an immediate to varEnd. 11323 Register varEnd = MRI.createVirtualRegister(TRC); 11324 if (Subtarget->useMovt()) { 11325 unsigned Vtmp = varEnd; 11326 if ((LoopSize & 0xFFFF0000) != 0) 11327 Vtmp = MRI.createVirtualRegister(TRC); 11328 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 11329 .addImm(LoopSize & 0xFFFF) 11330 .add(predOps(ARMCC::AL)); 11331 11332 if ((LoopSize & 0xFFFF0000) != 0) 11333 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 11334 .addReg(Vtmp) 11335 .addImm(LoopSize >> 16) 11336 .add(predOps(ARMCC::AL)); 11337 } else { 11338 MachineConstantPool *ConstantPool = MF->getConstantPool(); 11339 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 11340 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 11341 11342 // MachineConstantPool wants an explicit alignment. 
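    // (This is the fallback when movw/movt is unavailable: LoopSize is put in
    // the constant pool and loaded with tLDRpci/LDRcp below.)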
11343 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 11344 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 11345 MachineMemOperand *CPMMO = 11346 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 11347 MachineMemOperand::MOLoad, 4, Align(4)); 11348 11349 if (IsThumb) 11350 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 11351 .addReg(varEnd, RegState::Define) 11352 .addConstantPoolIndex(Idx) 11353 .add(predOps(ARMCC::AL)) 11354 .addMemOperand(CPMMO); 11355 else 11356 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 11357 .addReg(varEnd, RegState::Define) 11358 .addConstantPoolIndex(Idx) 11359 .addImm(0) 11360 .add(predOps(ARMCC::AL)) 11361 .addMemOperand(CPMMO); 11362 } 11363 BB->addSuccessor(loopMBB); 11364 11365 // Generate the loop body: 11366 // varPhi = PHI(varLoop, varEnd) 11367 // srcPhi = PHI(srcLoop, src) 11368 // destPhi = PHI(destLoop, dst) 11369 MachineBasicBlock *entryBB = BB; 11370 BB = loopMBB; 11371 Register varLoop = MRI.createVirtualRegister(TRC); 11372 Register varPhi = MRI.createVirtualRegister(TRC); 11373 Register srcLoop = MRI.createVirtualRegister(TRC); 11374 Register srcPhi = MRI.createVirtualRegister(TRC); 11375 Register destLoop = MRI.createVirtualRegister(TRC); 11376 Register destPhi = MRI.createVirtualRegister(TRC); 11377 11378 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 11379 .addReg(varLoop).addMBB(loopMBB) 11380 .addReg(varEnd).addMBB(entryBB); 11381 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 11382 .addReg(srcLoop).addMBB(loopMBB) 11383 .addReg(src).addMBB(entryBB); 11384 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 11385 .addReg(destLoop).addMBB(loopMBB) 11386 .addReg(dest).addMBB(entryBB); 11387 11388 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 11389 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 11390 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 11391 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 11392 IsThumb1, IsThumb2); 11393 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 11394 IsThumb1, IsThumb2); 11395 11396 // Decrement loop variable by UnitSize. 11397 if (IsThumb1) { 11398 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 11399 .add(t1CondCodeOp()) 11400 .addReg(varPhi) 11401 .addImm(UnitSize) 11402 .add(predOps(ARMCC::AL)); 11403 } else { 11404 MachineInstrBuilder MIB = 11405 BuildMI(*BB, BB->end(), dl, 11406 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 11407 MIB.addReg(varPhi) 11408 .addImm(UnitSize) 11409 .add(predOps(ARMCC::AL)) 11410 .add(condCodeOp()); 11411 MIB->getOperand(5).setReg(ARM::CPSR); 11412 MIB->getOperand(5).setIsDef(true); 11413 } 11414 BuildMI(*BB, BB->end(), dl, 11415 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 11416 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 11417 11418 // loopMBB can loop back to loopMBB or fall through to exitMBB. 11419 BB->addSuccessor(loopMBB); 11420 BB->addSuccessor(exitMBB); 11421 11422 // Add epilogue to handle BytesLeft. 
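  // (BytesLeft == SizeVal % UnitSize, so this is at most UnitSize - 1 byte
  // copies inserted at the start of exitMBB.)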
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}

MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.
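  // For illustration, the sequence emitted below is roughly
  //   bl __chkstk               (small/medium/kernel code models)
  // or, for the large code model,
  //   movw/movt rN, __chkstk
  //   blx rN
  // followed in either case by "sub.w sp, sp, r4".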
11474 11475 switch (TM.getCodeModel()) { 11476 case CodeModel::Tiny: 11477 llvm_unreachable("Tiny code model not available on ARM."); 11478 case CodeModel::Small: 11479 case CodeModel::Medium: 11480 case CodeModel::Kernel: 11481 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 11482 .add(predOps(ARMCC::AL)) 11483 .addExternalSymbol("__chkstk") 11484 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11485 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11486 .addReg(ARM::R12, 11487 RegState::Implicit | RegState::Define | RegState::Dead) 11488 .addReg(ARM::CPSR, 11489 RegState::Implicit | RegState::Define | RegState::Dead); 11490 break; 11491 case CodeModel::Large: { 11492 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11493 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11494 11495 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 11496 .addExternalSymbol("__chkstk"); 11497 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) 11498 .add(predOps(ARMCC::AL)) 11499 .addReg(Reg, RegState::Kill) 11500 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11501 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11502 .addReg(ARM::R12, 11503 RegState::Implicit | RegState::Define | RegState::Dead) 11504 .addReg(ARM::CPSR, 11505 RegState::Implicit | RegState::Define | RegState::Dead); 11506 break; 11507 } 11508 } 11509 11510 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 11511 .addReg(ARM::SP, RegState::Kill) 11512 .addReg(ARM::R4, RegState::Kill) 11513 .setMIFlags(MachineInstr::FrameSetup) 11514 .add(predOps(ARMCC::AL)) 11515 .add(condCodeOp()); 11516 11517 MI.eraseFromParent(); 11518 return MBB; 11519 } 11520 11521 MachineBasicBlock * 11522 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 11523 MachineBasicBlock *MBB) const { 11524 DebugLoc DL = MI.getDebugLoc(); 11525 MachineFunction *MF = MBB->getParent(); 11526 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11527 11528 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 11529 MF->insert(++MBB->getIterator(), ContBB); 11530 ContBB->splice(ContBB->begin(), MBB, 11531 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 11532 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 11533 MBB->addSuccessor(ContBB); 11534 11535 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 11536 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 11537 MF->push_back(TrapBB); 11538 MBB->addSuccessor(TrapBB); 11539 11540 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 11541 .addReg(MI.getOperand(0).getReg()) 11542 .addImm(0) 11543 .add(predOps(ARMCC::AL)); 11544 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 11545 .addMBB(TrapBB) 11546 .addImm(ARMCC::EQ) 11547 .addReg(ARM::CPSR); 11548 11549 MI.eraseFromParent(); 11550 return ContBB; 11551 } 11552 11553 // The CPSR operand of SelectItr might be missing a kill marker 11554 // because there were multiple uses of CPSR, and ISel didn't know 11555 // which to mark. Figure out whether SelectItr should have had a 11556 // kill marker, and set it if it should. Returns the correct kill 11557 // marker value. 11558 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 11559 MachineBasicBlock* BB, 11560 const TargetRegisterInfo* TRI) { 11561 // Scan forward through BB for a use/def of CPSR. 
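  // (A read of CPSR before any def means it is live past SelectItr, so no
  // kill flag is added; hitting a def, or reaching the end of the block with
  // CPSR not live-in to any successor, means SelectItr held the last use.)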
11562 MachineBasicBlock::iterator miI(std::next(SelectItr)); 11563 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 11564 const MachineInstr& mi = *miI; 11565 if (mi.readsRegister(ARM::CPSR)) 11566 return false; 11567 if (mi.definesRegister(ARM::CPSR)) 11568 break; // Should have kill-flag - update below. 11569 } 11570 11571 // If we hit the end of the block, check whether CPSR is live into a 11572 // successor. 11573 if (miI == BB->end()) { 11574 for (MachineBasicBlock *Succ : BB->successors()) 11575 if (Succ->isLiveIn(ARM::CPSR)) 11576 return false; 11577 } 11578 11579 // We found a def, or hit the end of the basic block and CPSR wasn't live 11580 // out. SelectMI should have a kill flag on CPSR. 11581 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 11582 return true; 11583 } 11584 11585 /// Adds logic in loop entry MBB to calculate loop iteration count and adds 11586 /// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop 11587 static Register genTPEntry(MachineBasicBlock *TpEntry, 11588 MachineBasicBlock *TpLoopBody, 11589 MachineBasicBlock *TpExit, Register OpSizeReg, 11590 const TargetInstrInfo *TII, DebugLoc Dl, 11591 MachineRegisterInfo &MRI) { 11592 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4. 11593 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11594 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) 11595 .addUse(OpSizeReg) 11596 .addImm(15) 11597 .add(predOps(ARMCC::AL)) 11598 .addReg(0); 11599 11600 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11601 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) 11602 .addUse(AddDestReg, RegState::Kill) 11603 .addImm(4) 11604 .add(predOps(ARMCC::AL)) 11605 .addReg(0); 11606 11607 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11608 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) 11609 .addUse(LsrDestReg, RegState::Kill); 11610 11611 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) 11612 .addUse(TotalIterationsReg) 11613 .addMBB(TpExit); 11614 11615 BuildMI(TpEntry, Dl, TII->get(ARM::t2B)) 11616 .addMBB(TpLoopBody) 11617 .add(predOps(ARMCC::AL)); 11618 11619 return TotalIterationsReg; 11620 } 11621 11622 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and 11623 /// t2DoLoopEnd. These are used by later passes to generate tail predicated 11624 /// loops. 11625 static void genTPLoopBody(MachineBasicBlock *TpLoopBody, 11626 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, 11627 const TargetInstrInfo *TII, DebugLoc Dl, 11628 MachineRegisterInfo &MRI, Register OpSrcReg, 11629 Register OpDestReg, Register ElementCountReg, 11630 Register TotalIterationsReg, bool IsMemcpy) { 11631 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest 11632 // array, loop iteration counter, predication counter. 
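// Schematically, the four PHIs built below look like this (virtual register
// names are illustrative only):
//   %srcPhi  = PHI [ %OpSrc,      TpEntry ], [ %currSrc,  TpLoopBody ]
//   %destPhi = PHI [ %OpDest,     TpEntry ], [ %currDest, TpLoopBody ]
//   %loopPhi = PHI [ %TotalIters, TpEntry ], [ %remIters, TpLoopBody ]
//   %predPhi = PHI [ %ElemCount,  TpEntry ], [ %remElems, TpLoopBody ]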
11633 11634 Register SrcPhiReg, CurrSrcReg; 11635 if (IsMemcpy) { 11636 // Current position in the src array 11637 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11638 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11639 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) 11640 .addUse(OpSrcReg) 11641 .addMBB(TpEntry) 11642 .addUse(CurrSrcReg) 11643 .addMBB(TpLoopBody); 11644 } 11645 11646 // Current position in the dest array 11647 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11648 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11649 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) 11650 .addUse(OpDestReg) 11651 .addMBB(TpEntry) 11652 .addUse(CurrDestReg) 11653 .addMBB(TpLoopBody); 11654 11655 // Current loop counter 11656 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11657 Register RemainingLoopIterationsReg = 11658 MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11659 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) 11660 .addUse(TotalIterationsReg) 11661 .addMBB(TpEntry) 11662 .addUse(RemainingLoopIterationsReg) 11663 .addMBB(TpLoopBody); 11664 11665 // Predication counter 11666 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11667 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11668 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) 11669 .addUse(ElementCountReg) 11670 .addMBB(TpEntry) 11671 .addUse(RemainingElementsReg) 11672 .addMBB(TpLoopBody); 11673 11674 // Pass predication counter to VCTP 11675 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); 11676 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) 11677 .addUse(PredCounterPhiReg) 11678 .addImm(ARMVCC::None) 11679 .addReg(0) 11680 .addReg(0); 11681 11682 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) 11683 .addUse(PredCounterPhiReg) 11684 .addImm(16) 11685 .add(predOps(ARMCC::AL)) 11686 .addReg(0); 11687 11688 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR 11689 Register SrcValueReg; 11690 if (IsMemcpy) { 11691 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); 11692 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) 11693 .addDef(CurrSrcReg) 11694 .addDef(SrcValueReg) 11695 .addReg(SrcPhiReg) 11696 .addImm(16) 11697 .addImm(ARMVCC::Then) 11698 .addUse(VccrReg) 11699 .addReg(0); 11700 } else 11701 SrcValueReg = OpSrcReg; 11702 11703 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) 11704 .addDef(CurrDestReg) 11705 .addUse(SrcValueReg) 11706 .addReg(DestPhiReg) 11707 .addImm(16) 11708 .addImm(ARMVCC::Then) 11709 .addUse(VccrReg) 11710 .addReg(0); 11711 11712 // Add the pseudoInstrs for decrementing the loop counter and marking the 11713 // end:t2DoLoopDec and t2DoLoopEnd 11714 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) 11715 .addUse(LoopCounterPhiReg) 11716 .addImm(1); 11717 11718 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) 11719 .addUse(RemainingLoopIterationsReg) 11720 .addMBB(TpLoopBody); 11721 11722 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) 11723 .addMBB(TpExit) 11724 .add(predOps(ARMCC::AL)); 11725 } 11726 11727 MachineBasicBlock * 11728 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 11729 MachineBasicBlock *BB) const { 11730 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11731 DebugLoc dl = MI.getDebugLoc(); 11732 bool isThumb2 = Subtarget->isThumb2(); 11733 switch 
(MI.getOpcode()) {
11734 default: {
11735 MI.print(errs());
11736 llvm_unreachable("Unexpected instr type to insert");
11737 }
11738
11739 // Thumb1 post-indexed loads are really just single-register LDMs.
11740 case ARM::tLDR_postidx: {
11741 MachineOperand Def(MI.getOperand(1));
11742 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11743 .add(Def) // Rn_wb
11744 .add(MI.getOperand(2)) // Rn
11745 .add(MI.getOperand(3)) // PredImm
11746 .add(MI.getOperand(4)) // PredReg
11747 .add(MI.getOperand(0)) // Rt
11748 .cloneMemRefs(MI);
11749 MI.eraseFromParent();
11750 return BB;
11751 }
11752
11753 case ARM::MVE_MEMCPYLOOPINST:
11754 case ARM::MVE_MEMSETLOOPINST: {
11755
11756 // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
11757 // Pseudo into a Tail Predicated (TP) Loop. It adds the instructions to
11758 // calculate the iteration count = ceil(size_in_bytes / 16) in the TP entry
11759 // block and adds the relevant instructions in the TP loop Body for
11760 // generation of a WLSTP loop.
11761
11762 // Below is the relevant portion of the CFG after the transformation.
11763 // The Machine Basic Blocks are shown along with branch conditions (in
11764 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11765 // portion of the CFG and may not necessarily be the entry/exit of the
11766 // function.
11767
11768 // (Relevant) CFG after transformation:
11769 // TP entry MBB
11770 // |
11771 // |-----------------|
11772 // (n <= 0) (n > 0)
11773 // | |
11774 // | TP loop Body MBB<--|
11775 // | | |
11776 // \ |___________|
11777 // \ /
11778 // TP exit MBB
11779
11780 MachineFunction *MF = BB->getParent();
11781 MachineFunctionProperties &Properties = MF->getProperties();
11782 MachineRegisterInfo &MRI = MF->getRegInfo();
11783
11784 Register OpDestReg = MI.getOperand(0).getReg();
11785 Register OpSrcReg = MI.getOperand(1).getReg();
11786 Register OpSizeReg = MI.getOperand(2).getReg();
11787
11788 // Allocate the required MBBs and add to parent function.
11789 MachineBasicBlock *TpEntry = BB;
11790 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11791 MachineBasicBlock *TpExit;
11792
11793 MF->push_back(TpLoopBody);
11794
11795 // If any instructions are present in the current block after
11796 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11797 // move the instructions into the newly created exit block. If there are no
11798 // instructions, add an explicit branch to the FallThrough block and then
11799 // split.
11800 //
11801 // The split is required for two reasons:
11802 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
11803 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11804 // need to be updated. splitAt() already handles this.
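// Schematically (block names are illustrative), starting from
//   BB: ... ; MVE_MEMCPYLOOPINST ; <trailing instructions>
// the split below yields
//   TpEntry (== BB): ...        TpExit: <trailing instructions>
// with TpLoopBody linked in between the two by the code that follows.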
11805 TpExit = BB->splitAt(MI, false);
11806 if (TpExit == BB) {
11807 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11808 "block containing memcpy/memset Pseudo");
11809 TpExit = BB->getFallThrough();
11810 BuildMI(BB, dl, TII->get(ARM::t2B))
11811 .addMBB(TpExit)
11812 .add(predOps(ARMCC::AL));
11813 TpExit = BB->splitAt(MI, false);
11814 }
11815
11816 // Add logic for iteration count
11817 Register TotalIterationsReg =
11818 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11819
11820 // Add the vectorized (and predicated) load/store instructions
11821 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11822 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11823 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11824
11825 // Required to avoid conflict with the MachineVerifier during testing.
11826 Properties.reset(MachineFunctionProperties::Property::NoPHIs);
11827
11828 // Connect the blocks
11829 TpEntry->addSuccessor(TpLoopBody);
11830 TpLoopBody->addSuccessor(TpLoopBody);
11831 TpLoopBody->addSuccessor(TpExit);
11832
11833 // Reorder for a more natural layout
11834 TpLoopBody->moveAfter(TpEntry);
11835 TpExit->moveAfter(TpLoopBody);
11836
11837 // Finally, remove the memcpy pseudo instruction
11838 MI.eraseFromParent();
11839
11840 // Return the exit block as it may contain other instructions requiring a
11841 // custom inserter
11842 return TpExit;
11843 }
11844
11845 // The Thumb2 pre-indexed stores have the same MI operands; they just
11846 // define them differently in the .td files from the isel patterns, so
11847 // they need pseudos.
11848 case ARM::t2STR_preidx:
11849 MI.setDesc(TII->get(ARM::t2STR_PRE));
11850 return BB;
11851 case ARM::t2STRB_preidx:
11852 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11853 return BB;
11854 case ARM::t2STRH_preidx:
11855 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11856 return BB;
11857
11858 case ARM::STRi_preidx:
11859 case ARM::STRBi_preidx: {
11860 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11861 : ARM::STRB_PRE_IMM;
11862 // Decode the offset.
11863 unsigned Offset = MI.getOperand(4).getImm();
11864 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11865 Offset = ARM_AM::getAM2Offset(Offset);
11866 if (isSub)
11867 Offset = -Offset;
11868
11869 MachineMemOperand *MMO = *MI.memoperands_begin();
11870 BuildMI(*BB, MI, dl, TII->get(NewOpc))
11871 .add(MI.getOperand(0)) // Rn_wb
11872 .add(MI.getOperand(1)) // Rt
11873 .add(MI.getOperand(2)) // Rn
11874 .addImm(Offset) // offset (skip GPR==zero_reg)
11875 .add(MI.getOperand(5)) // pred
11876 .add(MI.getOperand(6))
11877 .addMemOperand(MMO);
11878 MI.eraseFromParent();
11879 return BB;
11880 }
11881 case ARM::STRr_preidx:
11882 case ARM::STRBr_preidx:
11883 case ARM::STRH_preidx: {
11884 unsigned NewOpc;
11885 switch (MI.getOpcode()) {
11886 default: llvm_unreachable("unexpected opcode!");
11887 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11888 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11889 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11890 }
11891 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11892 for (const MachineOperand &MO : MI.operands())
11893 MIB.add(MO);
11894 MI.eraseFromParent();
11895 return BB;
11896 }
11897
11898 case ARM::tMOVCCr_pseudo: {
11899 // To "insert" a SELECT_CC instruction, we actually have to insert the
11900 // diamond control-flow pattern.
The incoming instruction knows the 11901 // destination vreg to set, the condition code register to branch on, the 11902 // true/false values to select between, and a branch opcode to use. 11903 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11904 MachineFunction::iterator It = ++BB->getIterator(); 11905 11906 // thisMBB: 11907 // ... 11908 // TrueVal = ... 11909 // cmpTY ccX, r1, r2 11910 // bCC copy1MBB 11911 // fallthrough --> copy0MBB 11912 MachineBasicBlock *thisMBB = BB; 11913 MachineFunction *F = BB->getParent(); 11914 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11915 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11916 F->insert(It, copy0MBB); 11917 F->insert(It, sinkMBB); 11918 11919 // Check whether CPSR is live past the tMOVCCr_pseudo. 11920 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 11921 if (!MI.killsRegister(ARM::CPSR) && 11922 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 11923 copy0MBB->addLiveIn(ARM::CPSR); 11924 sinkMBB->addLiveIn(ARM::CPSR); 11925 } 11926 11927 // Transfer the remainder of BB and its successor edges to sinkMBB. 11928 sinkMBB->splice(sinkMBB->begin(), BB, 11929 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11930 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11931 11932 BB->addSuccessor(copy0MBB); 11933 BB->addSuccessor(sinkMBB); 11934 11935 BuildMI(BB, dl, TII->get(ARM::tBcc)) 11936 .addMBB(sinkMBB) 11937 .addImm(MI.getOperand(3).getImm()) 11938 .addReg(MI.getOperand(4).getReg()); 11939 11940 // copy0MBB: 11941 // %FalseValue = ... 11942 // # fallthrough to sinkMBB 11943 BB = copy0MBB; 11944 11945 // Update machine-CFG edges 11946 BB->addSuccessor(sinkMBB); 11947 11948 // sinkMBB: 11949 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11950 // ... 11951 BB = sinkMBB; 11952 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 11953 .addReg(MI.getOperand(1).getReg()) 11954 .addMBB(copy0MBB) 11955 .addReg(MI.getOperand(2).getReg()) 11956 .addMBB(thisMBB); 11957 11958 MI.eraseFromParent(); // The pseudo instruction is gone now. 11959 return BB; 11960 } 11961 11962 case ARM::BCCi64: 11963 case ARM::BCCZi64: { 11964 // If there is an unconditional branch to the other successor, remove it. 11965 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11966 11967 // Compare both parts that make up the double comparison separately for 11968 // equality. 11969 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 11970 11971 Register LHS1 = MI.getOperand(1).getReg(); 11972 Register LHS2 = MI.getOperand(2).getReg(); 11973 if (RHSisZero) { 11974 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11975 .addReg(LHS1) 11976 .addImm(0) 11977 .add(predOps(ARMCC::AL)); 11978 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11979 .addReg(LHS2).addImm(0) 11980 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11981 } else { 11982 Register RHS1 = MI.getOperand(3).getReg(); 11983 Register RHS2 = MI.getOperand(4).getReg(); 11984 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11985 .addReg(LHS1) 11986 .addReg(RHS1) 11987 .add(predOps(ARMCC::AL)); 11988 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11989 .addReg(LHS2).addReg(RHS2) 11990 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11991 } 11992 11993 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 
3 : 5).getMBB(); 11994 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 11995 if (MI.getOperand(0).getImm() == ARMCC::NE) 11996 std::swap(destMBB, exitMBB); 11997 11998 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 11999 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 12000 if (isThumb2) 12001 BuildMI(BB, dl, TII->get(ARM::t2B)) 12002 .addMBB(exitMBB) 12003 .add(predOps(ARMCC::AL)); 12004 else 12005 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 12006 12007 MI.eraseFromParent(); // The pseudo instruction is gone now. 12008 return BB; 12009 } 12010 12011 case ARM::Int_eh_sjlj_setjmp: 12012 case ARM::Int_eh_sjlj_setjmp_nofp: 12013 case ARM::tInt_eh_sjlj_setjmp: 12014 case ARM::t2Int_eh_sjlj_setjmp: 12015 case ARM::t2Int_eh_sjlj_setjmp_nofp: 12016 return BB; 12017 12018 case ARM::Int_eh_sjlj_setup_dispatch: 12019 EmitSjLjDispatchBlock(MI, BB); 12020 return BB; 12021 12022 case ARM::ABS: 12023 case ARM::t2ABS: { 12024 // To insert an ABS instruction, we have to insert the 12025 // diamond control-flow pattern. The incoming instruction knows the 12026 // source vreg to test against 0, the destination vreg to set, 12027 // the condition code register to branch on, the 12028 // true/false values to select between, and a branch opcode to use. 12029 // It transforms 12030 // V1 = ABS V0 12031 // into 12032 // V2 = MOVS V0 12033 // BCC (branch to SinkBB if V0 >= 0) 12034 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 12035 // SinkBB: V1 = PHI(V2, V3) 12036 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12037 MachineFunction::iterator BBI = ++BB->getIterator(); 12038 MachineFunction *Fn = BB->getParent(); 12039 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 12040 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 12041 Fn->insert(BBI, RSBBB); 12042 Fn->insert(BBI, SinkBB); 12043 12044 Register ABSSrcReg = MI.getOperand(1).getReg(); 12045 Register ABSDstReg = MI.getOperand(0).getReg(); 12046 bool ABSSrcKIll = MI.getOperand(1).isKill(); 12047 bool isThumb2 = Subtarget->isThumb2(); 12048 MachineRegisterInfo &MRI = Fn->getRegInfo(); 12049 // In Thumb mode S must not be specified if source register is the SP or 12050 // PC and if destination register is the SP, so restrict register class 12051 Register NewRsbDstReg = MRI.createVirtualRegister( 12052 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 12053 12054 // Transfer the remainder of BB and its successor edges to sinkMBB. 12055 SinkBB->splice(SinkBB->begin(), BB, 12056 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 12057 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 12058 12059 BB->addSuccessor(RSBBB); 12060 BB->addSuccessor(SinkBB); 12061 12062 // fall through to SinkMBB 12063 RSBBB->addSuccessor(SinkBB); 12064 12065 // insert a cmp at the end of BB 12066 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 12067 .addReg(ABSSrcReg) 12068 .addImm(0) 12069 .add(predOps(ARMCC::AL)); 12070 12071 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 12072 BuildMI(BB, dl, 12073 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 12074 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 12075 12076 // insert rsbri in RSBBB 12077 // Note: BCC and rsbri will be converted into predicated rsbmi 12078 // by if-conversion pass 12079 BuildMI(*RSBBB, RSBBB->begin(), dl, 12080 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 12081 .addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0) 12082 .addImm(0) 12083 .add(predOps(ARMCC::AL)) 12084 .add(condCodeOp()); 12085 12086 // insert PHI in SinkBB, 12087 // reuse ABSDstReg to not change uses of ABS instruction 12088 BuildMI(*SinkBB, SinkBB->begin(), dl, 12089 TII->get(ARM::PHI), ABSDstReg) 12090 .addReg(NewRsbDstReg).addMBB(RSBBB) 12091 .addReg(ABSSrcReg).addMBB(BB); 12092 12093 // remove ABS instruction 12094 MI.eraseFromParent(); 12095 12096 // return last added BB 12097 return SinkBB; 12098 } 12099 case ARM::COPY_STRUCT_BYVAL_I32: 12100 ++NumLoopByVals; 12101 return EmitStructByval(MI, BB); 12102 case ARM::WIN__CHKSTK: 12103 return EmitLowered__chkstk(MI, BB); 12104 case ARM::WIN__DBZCHK: 12105 return EmitLowered__dbzchk(MI, BB); 12106 } 12107 } 12108 12109 /// Attaches vregs to MEMCPY that it will use as scratch registers 12110 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 12111 /// instead of as a custom inserter because we need the use list from the SDNode. 12112 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 12113 MachineInstr &MI, const SDNode *Node) { 12114 bool isThumb1 = Subtarget->isThumb1Only(); 12115 12116 DebugLoc DL = MI.getDebugLoc(); 12117 MachineFunction *MF = MI.getParent()->getParent(); 12118 MachineRegisterInfo &MRI = MF->getRegInfo(); 12119 MachineInstrBuilder MIB(*MF, MI); 12120 12121 // If the new dst/src is unused mark it as dead. 12122 if (!Node->hasAnyUseOfValue(0)) { 12123 MI.getOperand(0).setIsDead(true); 12124 } 12125 if (!Node->hasAnyUseOfValue(1)) { 12126 MI.getOperand(1).setIsDead(true); 12127 } 12128 12129 // The MEMCPY both defines and kills the scratch registers. 12130 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 12131 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 12132 : &ARM::GPRRegClass); 12133 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 12134 } 12135 } 12136 12137 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 12138 SDNode *Node) const { 12139 if (MI.getOpcode() == ARM::MEMCPY) { 12140 attachMEMCPYScratchRegs(Subtarget, MI, Node); 12141 return; 12142 } 12143 12144 const MCInstrDesc *MCID = &MI.getDesc(); 12145 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 12146 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 12147 // operand is still set to noreg. If needed, set the optional operand's 12148 // register to CPSR, and remove the redundant implicit def. 12149 // 12150 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 12151 12152 // Rename pseudo opcodes. 
12153 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 12154 unsigned ccOutIdx; 12155 if (NewOpc) { 12156 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 12157 MCID = &TII->get(NewOpc); 12158 12159 assert(MCID->getNumOperands() == 12160 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 12161 && "converted opcode should be the same except for cc_out" 12162 " (and, on Thumb1, pred)"); 12163 12164 MI.setDesc(*MCID); 12165 12166 // Add the optional cc_out operand 12167 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 12168 12169 // On Thumb1, move all input operands to the end, then add the predicate 12170 if (Subtarget->isThumb1Only()) { 12171 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 12172 MI.addOperand(MI.getOperand(1)); 12173 MI.RemoveOperand(1); 12174 } 12175 12176 // Restore the ties 12177 for (unsigned i = MI.getNumOperands(); i--;) { 12178 const MachineOperand& op = MI.getOperand(i); 12179 if (op.isReg() && op.isUse()) { 12180 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 12181 if (DefIdx != -1) 12182 MI.tieOperands(DefIdx, i); 12183 } 12184 } 12185 12186 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 12187 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 12188 ccOutIdx = 1; 12189 } else 12190 ccOutIdx = MCID->getNumOperands() - 1; 12191 } else 12192 ccOutIdx = MCID->getNumOperands() - 1; 12193 12194 // Any ARM instruction that sets the 's' bit should specify an optional 12195 // "cc_out" operand in the last operand position. 12196 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 12197 assert(!NewOpc && "Optional cc_out operand required"); 12198 return; 12199 } 12200 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 12201 // since we already have an optional CPSR def. 12202 bool definesCPSR = false; 12203 bool deadCPSR = false; 12204 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 12205 ++i) { 12206 const MachineOperand &MO = MI.getOperand(i); 12207 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 12208 definesCPSR = true; 12209 if (MO.isDead()) 12210 deadCPSR = true; 12211 MI.RemoveOperand(i); 12212 break; 12213 } 12214 } 12215 if (!definesCPSR) { 12216 assert(!NewOpc && "Optional cc_out operand required"); 12217 return; 12218 } 12219 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 12220 if (deadCPSR) { 12221 assert(!MI.getOperand(ccOutIdx).getReg() && 12222 "expect uninitialized optional cc_out operand"); 12223 // Thumb1 instructions must have the S bit even if the CPSR is dead. 12224 if (!Subtarget->isThumb1Only()) 12225 return; 12226 } 12227 12228 // If this instruction was defined with an optional CPSR def and its dag node 12229 // had a live implicit CPSR def, then activate the optional CPSR def. 12230 MachineOperand &MO = MI.getOperand(ccOutIdx); 12231 MO.setReg(ARM::CPSR); 12232 MO.setIsDef(true); 12233 } 12234 12235 //===----------------------------------------------------------------------===// 12236 // ARM Optimization Hooks 12237 //===----------------------------------------------------------------------===// 12238 12239 // Helper function that checks if N is a null or all ones constant. 12240 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 12241 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 12242 } 12243 12244 // Return true if N is conditionally 0 or all ones. 
12245 // Detects these expressions where cc is an i1 value: 12246 // 12247 // (select cc 0, y) [AllOnes=0] 12248 // (select cc y, 0) [AllOnes=0] 12249 // (zext cc) [AllOnes=0] 12250 // (sext cc) [AllOnes=0/1] 12251 // (select cc -1, y) [AllOnes=1] 12252 // (select cc y, -1) [AllOnes=1] 12253 // 12254 // Invert is set when N is the null/all ones constant when CC is false. 12255 // OtherOp is set to the alternative value of N. 12256 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 12257 SDValue &CC, bool &Invert, 12258 SDValue &OtherOp, 12259 SelectionDAG &DAG) { 12260 switch (N->getOpcode()) { 12261 default: return false; 12262 case ISD::SELECT: { 12263 CC = N->getOperand(0); 12264 SDValue N1 = N->getOperand(1); 12265 SDValue N2 = N->getOperand(2); 12266 if (isZeroOrAllOnes(N1, AllOnes)) { 12267 Invert = false; 12268 OtherOp = N2; 12269 return true; 12270 } 12271 if (isZeroOrAllOnes(N2, AllOnes)) { 12272 Invert = true; 12273 OtherOp = N1; 12274 return true; 12275 } 12276 return false; 12277 } 12278 case ISD::ZERO_EXTEND: 12279 // (zext cc) can never be the all ones value. 12280 if (AllOnes) 12281 return false; 12282 LLVM_FALLTHROUGH; 12283 case ISD::SIGN_EXTEND: { 12284 SDLoc dl(N); 12285 EVT VT = N->getValueType(0); 12286 CC = N->getOperand(0); 12287 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 12288 return false; 12289 Invert = !AllOnes; 12290 if (AllOnes) 12291 // When looking for an AllOnes constant, N is an sext, and the 'other' 12292 // value is 0. 12293 OtherOp = DAG.getConstant(0, dl, VT); 12294 else if (N->getOpcode() == ISD::ZERO_EXTEND) 12295 // When looking for a 0 constant, N can be zext or sext. 12296 OtherOp = DAG.getConstant(1, dl, VT); 12297 else 12298 OtherOp = DAG.getAllOnesConstant(dl, VT); 12299 return true; 12300 } 12301 } 12302 } 12303 12304 // Combine a constant select operand into its use: 12305 // 12306 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 12307 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 12308 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 12309 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12310 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12311 // 12312 // The transform is rejected if the select doesn't have a constant operand that 12313 // is null, or all ones when AllOnes is set. 12314 // 12315 // Also recognize sext/zext from i1: 12316 // 12317 // (add (zext cc), x) -> (select cc (add x, 1), x) 12318 // (add (sext cc), x) -> (select cc (add x, -1), x) 12319 // 12320 // These transformations eventually create predicated instructions. 12321 // 12322 // @param N The node to transform. 12323 // @param Slct The N operand that is a select. 12324 // @param OtherOp The other N operand (x above). 12325 // @param DCI Context. 12326 // @param AllOnes Require the select constant to be all ones instead of null. 12327 // @returns The new node, or SDValue() on failure. 12328 static 12329 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 12330 TargetLowering::DAGCombinerInfo &DCI, 12331 bool AllOnes = false) { 12332 SelectionDAG &DAG = DCI.DAG; 12333 EVT VT = N->getValueType(0); 12334 SDValue NonConstantVal; 12335 SDValue CCOp; 12336 bool SwapSelectOps; 12337 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 12338 NonConstantVal, DAG)) 12339 return SDValue(); 12340 12341 // Slct is now know to be the desired identity constant when CC is true. 
12342 SDValue TrueVal = OtherOp; 12343 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 12344 OtherOp, NonConstantVal); 12345 // Unless SwapSelectOps says CC should be false. 12346 if (SwapSelectOps) 12347 std::swap(TrueVal, FalseVal); 12348 12349 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 12350 CCOp, TrueVal, FalseVal); 12351 } 12352 12353 // Attempt combineSelectAndUse on each operand of a commutative operator N. 12354 static 12355 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 12356 TargetLowering::DAGCombinerInfo &DCI) { 12357 SDValue N0 = N->getOperand(0); 12358 SDValue N1 = N->getOperand(1); 12359 if (N0.getNode()->hasOneUse()) 12360 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 12361 return Result; 12362 if (N1.getNode()->hasOneUse()) 12363 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 12364 return Result; 12365 return SDValue(); 12366 } 12367 12368 static bool IsVUZPShuffleNode(SDNode *N) { 12369 // VUZP shuffle node. 12370 if (N->getOpcode() == ARMISD::VUZP) 12371 return true; 12372 12373 // "VUZP" on i32 is an alias for VTRN. 12374 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 12375 return true; 12376 12377 return false; 12378 } 12379 12380 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 12381 TargetLowering::DAGCombinerInfo &DCI, 12382 const ARMSubtarget *Subtarget) { 12383 // Look for ADD(VUZP.0, VUZP.1). 12384 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 12385 N0 == N1) 12386 return SDValue(); 12387 12388 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 12389 if (!N->getValueType(0).is64BitVector()) 12390 return SDValue(); 12391 12392 // Generate vpadd. 12393 SelectionDAG &DAG = DCI.DAG; 12394 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12395 SDLoc dl(N); 12396 SDNode *Unzip = N0.getNode(); 12397 EVT VT = N->getValueType(0); 12398 12399 SmallVector<SDValue, 8> Ops; 12400 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 12401 TLI.getPointerTy(DAG.getDataLayout()))); 12402 Ops.push_back(Unzip->getOperand(0)); 12403 Ops.push_back(Unzip->getOperand(1)); 12404 12405 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 12406 } 12407 12408 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 12409 TargetLowering::DAGCombinerInfo &DCI, 12410 const ARMSubtarget *Subtarget) { 12411 // Check for two extended operands. 12412 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 12413 N1.getOpcode() == ISD::SIGN_EXTEND) && 12414 !(N0.getOpcode() == ISD::ZERO_EXTEND && 12415 N1.getOpcode() == ISD::ZERO_EXTEND)) 12416 return SDValue(); 12417 12418 SDValue N00 = N0.getOperand(0); 12419 SDValue N10 = N1.getOperand(0); 12420 12421 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 12422 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 12423 N00 == N10) 12424 return SDValue(); 12425 12426 // We only recognize Q register paddl here; this can't be reached until 12427 // after type legalization. 12428 if (!N00.getValueType().is64BitVector() || 12429 !N0.getValueType().is128BitVector()) 12430 return SDValue(); 12431 12432 // Generate vpaddl. 12433 SelectionDAG &DAG = DCI.DAG; 12434 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12435 SDLoc dl(N); 12436 EVT VT = N->getValueType(0); 12437 12438 SmallVector<SDValue, 8> Ops; 12439 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
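// For example (types are illustrative): with a and b of type v8i8 and
// VUZP(a, b) feeding the two sign-extends, ADD(SEXT(VUZP.0), SEXT(VUZP.1))
// of type v8i16 becomes a vpaddl.s8 of CONCAT_VECTORS(a, b) : v16i8.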
12440 unsigned Opcode;
12441 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12442 Opcode = Intrinsic::arm_neon_vpaddls;
12443 else
12444 Opcode = Intrinsic::arm_neon_vpaddlu;
12445 Ops.push_back(DAG.getConstant(Opcode, dl,
12446 TLI.getPointerTy(DAG.getDataLayout())));
12447 EVT ElemTy = N00.getValueType().getVectorElementType();
12448 unsigned NumElts = VT.getVectorNumElements();
12449 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12450 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12451 N00.getOperand(0), N00.getOperand(1));
12452 Ops.push_back(Concat);
12453
12454 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12455 }
12456
12457 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12458 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12459 // much easier to match.
12460 static SDValue
12461 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12462 TargetLowering::DAGCombinerInfo &DCI,
12463 const ARMSubtarget *Subtarget) {
12464 // Only perform this optimization after legalization and if NEON is
12465 // available. We also expect both operands to be BUILD_VECTORs.
12466 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12467 || N0.getOpcode() != ISD::BUILD_VECTOR
12468 || N1.getOpcode() != ISD::BUILD_VECTOR)
12469 return SDValue();
12470
12471 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12472 EVT VT = N->getValueType(0);
12473 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12474 return SDValue();
12475
12476 // Check that the vector operands are of the right form.
12477 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12478 // operands, where N is the size of the formed vector.
12479 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12480 // index such that we have a pairwise add pattern.
12481
12482 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12483 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12484 return SDValue();
12485 SDValue Vec = N0->getOperand(0)->getOperand(0);
12486 SDNode *V = Vec.getNode();
12487 unsigned nextIndex = 0;
12488
12489 // For each operand of the ADD (both are BUILD_VECTORs),
12490 // check whether each of its operands is an EXTRACT_VECTOR with
12491 // the same vector and the appropriate index.
12492 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12493 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12494 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12495
12496 SDValue ExtVec0 = N0->getOperand(i);
12497 SDValue ExtVec1 = N1->getOperand(i);
12498
12499 // First operand is the vector; verify it's the same.
12500 if (V != ExtVec0->getOperand(0).getNode() ||
12501 V != ExtVec1->getOperand(0).getNode())
12502 return SDValue();
12503
12504 // Second is the constant; verify it's correct.
12505 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12506 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12507
12508 // For the constants, we want to see all the even or all the odd indices.
12509 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12510 || C1->getZExtValue() != nextIndex+1)
12511 return SDValue();
12512
12513 // Increment index.
12514 nextIndex+=2;
12515 } else
12516 return SDValue();
12517 }
12518
12519 // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
Also make sure 12520 // we're using the entire input vector, otherwise there's a size/legality 12521 // mismatch somewhere. 12522 if (nextIndex != Vec.getValueType().getVectorNumElements() || 12523 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 12524 return SDValue(); 12525 12526 // Create VPADDL node. 12527 SelectionDAG &DAG = DCI.DAG; 12528 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12529 12530 SDLoc dl(N); 12531 12532 // Build operand list. 12533 SmallVector<SDValue, 8> Ops; 12534 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 12535 TLI.getPointerTy(DAG.getDataLayout()))); 12536 12537 // Input is the vector. 12538 Ops.push_back(Vec); 12539 12540 // Get widened type and narrowed type. 12541 MVT widenType; 12542 unsigned numElem = VT.getVectorNumElements(); 12543 12544 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 12545 switch (inputLaneType.getSimpleVT().SimpleTy) { 12546 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 12547 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 12548 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 12549 default: 12550 llvm_unreachable("Invalid vector element type for padd optimization."); 12551 } 12552 12553 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 12554 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 12555 return DAG.getNode(ExtOp, dl, VT, tmp); 12556 } 12557 12558 static SDValue findMUL_LOHI(SDValue V) { 12559 if (V->getOpcode() == ISD::UMUL_LOHI || 12560 V->getOpcode() == ISD::SMUL_LOHI) 12561 return V; 12562 return SDValue(); 12563 } 12564 12565 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 12566 TargetLowering::DAGCombinerInfo &DCI, 12567 const ARMSubtarget *Subtarget) { 12568 if (!Subtarget->hasBaseDSP()) 12569 return SDValue(); 12570 12571 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 12572 // accumulates the product into a 64-bit value. 
The 16-bit values will 12573 // be sign extended somehow or SRA'd into 32-bit values 12574 // (addc (adde (mul 16bit, 16bit), lo), hi) 12575 SDValue Mul = AddcNode->getOperand(0); 12576 SDValue Lo = AddcNode->getOperand(1); 12577 if (Mul.getOpcode() != ISD::MUL) { 12578 Lo = AddcNode->getOperand(0); 12579 Mul = AddcNode->getOperand(1); 12580 if (Mul.getOpcode() != ISD::MUL) 12581 return SDValue(); 12582 } 12583 12584 SDValue SRA = AddeNode->getOperand(0); 12585 SDValue Hi = AddeNode->getOperand(1); 12586 if (SRA.getOpcode() != ISD::SRA) { 12587 SRA = AddeNode->getOperand(1); 12588 Hi = AddeNode->getOperand(0); 12589 if (SRA.getOpcode() != ISD::SRA) 12590 return SDValue(); 12591 } 12592 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 12593 if (Const->getZExtValue() != 31) 12594 return SDValue(); 12595 } else 12596 return SDValue(); 12597 12598 if (SRA.getOperand(0) != Mul) 12599 return SDValue(); 12600 12601 SelectionDAG &DAG = DCI.DAG; 12602 SDLoc dl(AddcNode); 12603 unsigned Opcode = 0; 12604 SDValue Op0; 12605 SDValue Op1; 12606 12607 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 12608 Opcode = ARMISD::SMLALBB; 12609 Op0 = Mul.getOperand(0); 12610 Op1 = Mul.getOperand(1); 12611 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 12612 Opcode = ARMISD::SMLALBT; 12613 Op0 = Mul.getOperand(0); 12614 Op1 = Mul.getOperand(1).getOperand(0); 12615 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 12616 Opcode = ARMISD::SMLALTB; 12617 Op0 = Mul.getOperand(0).getOperand(0); 12618 Op1 = Mul.getOperand(1); 12619 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 12620 Opcode = ARMISD::SMLALTT; 12621 Op0 = Mul->getOperand(0).getOperand(0); 12622 Op1 = Mul->getOperand(1).getOperand(0); 12623 } 12624 12625 if (!Op0 || !Op1) 12626 return SDValue(); 12627 12628 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 12629 Op0, Op1, Lo, Hi); 12630 // Replace the ADDs' nodes uses by the MLA node's values. 12631 SDValue HiMLALResult(SMLAL.getNode(), 1); 12632 SDValue LoMLALResult(SMLAL.getNode(), 0); 12633 12634 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 12635 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 12636 12637 // Return original node to notify the driver to stop replacing. 12638 SDValue resNode(AddcNode, 0); 12639 return resNode; 12640 } 12641 12642 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 12643 TargetLowering::DAGCombinerInfo &DCI, 12644 const ARMSubtarget *Subtarget) { 12645 // Look for multiply add opportunities. 12646 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 12647 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 12648 // a glue link from the first add to the second add. 12649 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 12650 // a S/UMLAL instruction. 12651 // UMUL_LOHI 12652 // / :lo \ :hi 12653 // V \ [no multiline comment] 12654 // loAdd -> ADDC | 12655 // \ :carry / 12656 // V V 12657 // ADDE <- hiAdd 12658 // 12659 // In the special case where only the higher part of a signed result is used 12660 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 12661 // a constant with the exact value of 0x80000000, we recognize we are dealing 12662 // with a "rounded multiply and add" (or subtract) and transform it into 12663 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 
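// As a rough C-level illustration of the patterns described above (purely
// illustrative, not taken from a test):
//   uint64_t m = (uint64_t)a * b + acc;   // unsigned a, b -> UMLAL
//   int32_t  r = c + (int32_t)(((int64_t)a * (int64_t)b + 0x80000000LL) >> 32);
//                                          // signed a, b -> SMMLAR (roughly)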
12664 12665 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 12666 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 12667 "Expect an ADDE or SUBE"); 12668 12669 assert(AddeSubeNode->getNumOperands() == 3 && 12670 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 12671 "ADDE node has the wrong inputs"); 12672 12673 // Check that we are chained to the right ADDC or SUBC node. 12674 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 12675 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 12676 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 12677 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 12678 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 12679 return SDValue(); 12680 12681 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 12682 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 12683 12684 // Check if the two operands are from the same mul_lohi node. 12685 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 12686 return SDValue(); 12687 12688 assert(AddcSubcNode->getNumValues() == 2 && 12689 AddcSubcNode->getValueType(0) == MVT::i32 && 12690 "Expect ADDC with two result values. First: i32"); 12691 12692 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 12693 // maybe a SMLAL which multiplies two 16-bit values. 12694 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 12695 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 12696 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 12697 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 12698 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 12699 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 12700 12701 // Check for the triangle shape. 12702 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 12703 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 12704 12705 // Make sure that the ADDE/SUBE operands are not coming from the same node. 12706 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 12707 return SDValue(); 12708 12709 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 12710 bool IsLeftOperandMUL = false; 12711 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 12712 if (MULOp == SDValue()) 12713 MULOp = findMUL_LOHI(AddeSubeOp1); 12714 else 12715 IsLeftOperandMUL = true; 12716 if (MULOp == SDValue()) 12717 return SDValue(); 12718 12719 // Figure out the right opcode. 12720 unsigned Opc = MULOp->getOpcode(); 12721 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 12722 12723 // Figure out the high and low input values to the MLAL node. 12724 SDValue *HiAddSub = nullptr; 12725 SDValue *LoMul = nullptr; 12726 SDValue *LowAddSub = nullptr; 12727 12728 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 12729 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 12730 return SDValue(); 12731 12732 if (IsLeftOperandMUL) 12733 HiAddSub = &AddeSubeOp1; 12734 else 12735 HiAddSub = &AddeSubeOp0; 12736 12737 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 12738 // whose low result is fed to the ADDC/SUBC we are checking. 12739 12740 if (AddcSubcOp0 == MULOp.getValue(0)) { 12741 LoMul = &AddcSubcOp0; 12742 LowAddSub = &AddcSubcOp1; 12743 } 12744 if (AddcSubcOp1 == MULOp.getValue(0)) { 12745 LoMul = &AddcSubcOp1; 12746 LowAddSub = &AddcSubcOp0; 12747 } 12748 12749 if (!LoMul) 12750 return SDValue(); 12751 12752 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 12753 // the replacement below will create a cycle. 
12754 if (AddcSubcNode == HiAddSub->getNode() ||
12755 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12756 return SDValue();
12757
12758 // Create the merged node.
12759 SelectionDAG &DAG = DCI.DAG;
12760
12761 // Start building operand list.
12762 SmallVector<SDValue, 8> Ops;
12763 Ops.push_back(LoMul->getOperand(0));
12764 Ops.push_back(LoMul->getOperand(1));
12765
12766 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12767 // the case, we must be doing signed multiplication and only use the higher
12768 // part of the result of the MLAL; furthermore, the LowAddSub must be a
12769 // constant addition or subtraction with the value 0x80000000.
12770 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12771 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12772 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12773 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12774 0x80000000) {
12775 Ops.push_back(*HiAddSub);
12776 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12777 FinalOpc = ARMISD::SMMLSR;
12778 } else {
12779 FinalOpc = ARMISD::SMMLAR;
12780 }
12781 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12782 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12783
12784 return SDValue(AddeSubeNode, 0);
12785 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12786 // SMMLS is generated during instruction selection and the rest of this
12787 // function cannot handle the case where AddcSubcNode is a SUBC.
12788 return SDValue();
12789
12790 // Finish building the operand list for {U/S}MLAL
12791 Ops.push_back(*LowAddSub);
12792 Ops.push_back(*HiAddSub);
12793
12794 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12795 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12796
12797 // Replace the ADDs' nodes uses by the MLA node's values.
12798 SDValue HiMLALResult(MLALNode.getNode(), 1);
12799 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12800
12801 SDValue LoMLALResult(MLALNode.getNode(), 0);
12802 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12803
12804 // Return original node to notify the driver to stop replacing.
12805 return SDValue(AddeSubeNode, 0);
12806 }
12807
12808 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12809 TargetLowering::DAGCombinerInfo &DCI,
12810 const ARMSubtarget *Subtarget) {
12811 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12812 // While trying to combine for the other MLAL nodes, first search for the
12813 // chance to use UMAAL. Check if Addc uses a node which has already
12814 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12815 // as the addend, and it's handled in PerformUMLALCombine.
12816
12817 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12818 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12819
12820 // Check that we have a glued ADDC node.
12821 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12822 if (AddcNode->getOpcode() != ARMISD::ADDC)
12823 return SDValue();
12824
12825 // Find the converted UMAAL or quit if it doesn't exist.
12826 SDNode *UmlalNode = nullptr; 12827 SDValue AddHi; 12828 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 12829 UmlalNode = AddcNode->getOperand(0).getNode(); 12830 AddHi = AddcNode->getOperand(1); 12831 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 12832 UmlalNode = AddcNode->getOperand(1).getNode(); 12833 AddHi = AddcNode->getOperand(0); 12834 } else { 12835 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 12836 } 12837 12838 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 12839 // the ADDC as well as Zero. 12840 if (!isNullConstant(UmlalNode->getOperand(3))) 12841 return SDValue(); 12842 12843 if ((isNullConstant(AddeNode->getOperand(0)) && 12844 AddeNode->getOperand(1).getNode() == UmlalNode) || 12845 (AddeNode->getOperand(0).getNode() == UmlalNode && 12846 isNullConstant(AddeNode->getOperand(1)))) { 12847 SelectionDAG &DAG = DCI.DAG; 12848 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 12849 UmlalNode->getOperand(2), AddHi }; 12850 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 12851 DAG.getVTList(MVT::i32, MVT::i32), Ops); 12852 12853 // Replace the ADDs' nodes uses by the UMAAL node's values. 12854 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 12855 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 12856 12857 // Return original node to notify the driver to stop replacing. 12858 return SDValue(AddeNode, 0); 12859 } 12860 return SDValue(); 12861 } 12862 12863 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 12864 const ARMSubtarget *Subtarget) { 12865 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 12866 return SDValue(); 12867 12868 // Check that we have a pair of ADDC and ADDE as operands. 12869 // Both addends of the ADDE must be zero. 12870 SDNode* AddcNode = N->getOperand(2).getNode(); 12871 SDNode* AddeNode = N->getOperand(3).getNode(); 12872 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 12873 (AddeNode->getOpcode() == ARMISD::ADDE) && 12874 isNullConstant(AddeNode->getOperand(0)) && 12875 isNullConstant(AddeNode->getOperand(1)) && 12876 (AddeNode->getOperand(2).getNode() == AddcNode)) 12877 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 12878 DAG.getVTList(MVT::i32, MVT::i32), 12879 {N->getOperand(0), N->getOperand(1), 12880 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 12881 else 12882 return SDValue(); 12883 } 12884 12885 static SDValue PerformAddcSubcCombine(SDNode *N, 12886 TargetLowering::DAGCombinerInfo &DCI, 12887 const ARMSubtarget *Subtarget) { 12888 SelectionDAG &DAG(DCI.DAG); 12889 12890 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) { 12891 // (SUBC (ADDE 0, 0, C), 1) -> C 12892 SDValue LHS = N->getOperand(0); 12893 SDValue RHS = N->getOperand(1); 12894 if (LHS->getOpcode() == ARMISD::ADDE && 12895 isNullConstant(LHS->getOperand(0)) && 12896 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 12897 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 12898 } 12899 } 12900 12901 if (Subtarget->isThumb1Only()) { 12902 SDValue RHS = N->getOperand(1); 12903 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12904 int32_t imm = C->getSExtValue(); 12905 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 12906 SDLoc DL(N); 12907 RHS = DAG.getConstant(-imm, DL, MVT::i32); 12908 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? 
ARMISD::SUBC 12909 : ARMISD::ADDC; 12910 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 12911 } 12912 } 12913 } 12914 12915 return SDValue(); 12916 } 12917 12918 static SDValue PerformAddeSubeCombine(SDNode *N, 12919 TargetLowering::DAGCombinerInfo &DCI, 12920 const ARMSubtarget *Subtarget) { 12921 if (Subtarget->isThumb1Only()) { 12922 SelectionDAG &DAG = DCI.DAG; 12923 SDValue RHS = N->getOperand(1); 12924 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12925 int64_t imm = C->getSExtValue(); 12926 if (imm < 0) { 12927 SDLoc DL(N); 12928 12929 // The with-carry-in form matches bitwise not instead of the negation. 12930 // Effectively, the inverse interpretation of the carry flag already 12931 // accounts for part of the negation. 12932 RHS = DAG.getConstant(~imm, DL, MVT::i32); 12933 12934 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 12935 : ARMISD::ADDE; 12936 return DAG.getNode(Opcode, DL, N->getVTList(), 12937 N->getOperand(0), RHS, N->getOperand(2)); 12938 } 12939 } 12940 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 12941 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 12942 } 12943 return SDValue(); 12944 } 12945 12946 static SDValue PerformSELECTCombine(SDNode *N, 12947 TargetLowering::DAGCombinerInfo &DCI, 12948 const ARMSubtarget *Subtarget) { 12949 if (!Subtarget->hasMVEIntegerOps()) 12950 return SDValue(); 12951 12952 SDLoc dl(N); 12953 SDValue SetCC; 12954 SDValue LHS; 12955 SDValue RHS; 12956 ISD::CondCode CC; 12957 SDValue TrueVal; 12958 SDValue FalseVal; 12959 12960 if (N->getOpcode() == ISD::SELECT && 12961 N->getOperand(0)->getOpcode() == ISD::SETCC) { 12962 SetCC = N->getOperand(0); 12963 LHS = SetCC->getOperand(0); 12964 RHS = SetCC->getOperand(1); 12965 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 12966 TrueVal = N->getOperand(1); 12967 FalseVal = N->getOperand(2); 12968 } else if (N->getOpcode() == ISD::SELECT_CC) { 12969 LHS = N->getOperand(0); 12970 RHS = N->getOperand(1); 12971 CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 12972 TrueVal = N->getOperand(2); 12973 FalseVal = N->getOperand(3); 12974 } else { 12975 return SDValue(); 12976 } 12977 12978 unsigned int Opcode = 0; 12979 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || 12980 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && 12981 (CC == ISD::SETULT || CC == ISD::SETUGT)) { 12982 Opcode = ARMISD::VMINVu; 12983 if (CC == ISD::SETUGT) 12984 std::swap(TrueVal, FalseVal); 12985 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || 12986 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && 12987 (CC == ISD::SETLT || CC == ISD::SETGT)) { 12988 Opcode = ARMISD::VMINVs; 12989 if (CC == ISD::SETGT) 12990 std::swap(TrueVal, FalseVal); 12991 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || 12992 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && 12993 (CC == ISD::SETUGT || CC == ISD::SETULT)) { 12994 Opcode = ARMISD::VMAXVu; 12995 if (CC == ISD::SETULT) 12996 std::swap(TrueVal, FalseVal); 12997 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || 12998 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && 12999 (CC == ISD::SETGT || CC == ISD::SETLT)) { 13000 Opcode = ARMISD::VMAXVs; 13001 if (CC == ISD::SETLT) 13002 std::swap(TrueVal, FalseVal); 13003 } else 13004 return SDValue(); 13005 13006 // Normalise to the right hand side being the vector reduction 13007 switch (TrueVal->getOpcode()) { 13008 case ISD::VECREDUCE_UMIN: 13009 case ISD::VECREDUCE_SMIN: 13010 case ISD::VECREDUCE_UMAX: 13011 case ISD::VECREDUCE_SMAX: 
13012 std::swap(LHS, RHS);
13013 std::swap(TrueVal, FalseVal);
13014 break;
13015 }
13016
13017 EVT VectorType = FalseVal->getOperand(0).getValueType();
13018
13019 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13020 VectorType != MVT::v4i32)
13021 return SDValue();
13022
13023 EVT VectorScalarType = VectorType.getVectorElementType();
13024
13025 // The values being selected must also be the ones being compared
13026 if (TrueVal != LHS || FalseVal != RHS)
13027 return SDValue();
13028
13029 EVT LeftType = LHS->getValueType(0);
13030 EVT RightType = RHS->getValueType(0);
13031
13032 // The types must match the reduced type too
13033 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13034 return SDValue();
13035
13036 // Legalise the scalar to an i32
13037 if (VectorScalarType != MVT::i32)
13038 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13039
13040 // Generate the reduction as an i32 for legalisation purposes
13041 auto Reduction =
13042 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13043
13044 // The result isn't actually an i32 so truncate it back to its original type
13045 if (VectorScalarType != MVT::i32)
13046 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13047
13048 return Reduction;
13049 }
13050
13051 // A special combine for the vqdmulh family of instructions. This is one of
13052 // the potential patterns that could match this instruction. The base pattern
13053 // you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13054 // This matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13055 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13056 // the max is unnecessary.
13057 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13058 EVT VT = N->getValueType(0);
13059 SDValue Shft;
13060 ConstantSDNode *Clamp;
13061
13062 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13063 return SDValue();
13064
13065 if (N->getOpcode() == ISD::SMIN) {
13066 Shft = N->getOperand(0);
13067 Clamp = isConstOrConstSplat(N->getOperand(1));
13068 } else if (N->getOpcode() == ISD::VSELECT) {
13069 // Detect an SMIN, which for an i64 node will be a vselect/setcc, not a smin.
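// i.e. (vselect (setcc x, y, setlt), x, y) is smin(x, y); the checks below
// verify exactly that shape.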
13070 SDValue Cmp = N->getOperand(0); 13071 if (Cmp.getOpcode() != ISD::SETCC || 13072 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || 13073 Cmp.getOperand(0) != N->getOperand(1) || 13074 Cmp.getOperand(1) != N->getOperand(2)) 13075 return SDValue(); 13076 Shft = N->getOperand(1); 13077 Clamp = isConstOrConstSplat(N->getOperand(2)); 13078 } else 13079 return SDValue(); 13080 13081 if (!Clamp) 13082 return SDValue(); 13083 13084 MVT ScalarType; 13085 int ShftAmt = 0; 13086 switch (Clamp->getSExtValue()) { 13087 case (1 << 7) - 1: 13088 ScalarType = MVT::i8; 13089 ShftAmt = 7; 13090 break; 13091 case (1 << 15) - 1: 13092 ScalarType = MVT::i16; 13093 ShftAmt = 15; 13094 break; 13095 case (1ULL << 31) - 1: 13096 ScalarType = MVT::i32; 13097 ShftAmt = 31; 13098 break; 13099 default: 13100 return SDValue(); 13101 } 13102 13103 if (Shft.getOpcode() != ISD::SRA) 13104 return SDValue(); 13105 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); 13106 if (!N1 || N1->getSExtValue() != ShftAmt) 13107 return SDValue(); 13108 13109 SDValue Mul = Shft.getOperand(0); 13110 if (Mul.getOpcode() != ISD::MUL) 13111 return SDValue(); 13112 13113 SDValue Ext0 = Mul.getOperand(0); 13114 SDValue Ext1 = Mul.getOperand(1); 13115 if (Ext0.getOpcode() != ISD::SIGN_EXTEND || 13116 Ext1.getOpcode() != ISD::SIGN_EXTEND) 13117 return SDValue(); 13118 EVT VecVT = Ext0.getOperand(0).getValueType(); 13119 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1) 13120 return SDValue(); 13121 if (Ext1.getOperand(0).getValueType() != VecVT || 13122 VecVT.getScalarType() != ScalarType || 13123 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) 13124 return SDValue(); 13125 13126 SDLoc DL(Mul); 13127 unsigned LegalLanes = 128 / (ShftAmt + 1); 13128 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes); 13129 // For types smaller than legal vectors extend to be legal and only use needed 13130 // lanes. 13131 if (VecVT.getSizeInBits() < 128) { 13132 EVT ExtVecVT = 13133 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()), 13134 VecVT.getVectorNumElements()); 13135 SDValue Inp0 = 13136 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0)); 13137 SDValue Inp1 = 13138 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0)); 13139 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0); 13140 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1); 13141 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 13142 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH); 13143 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc); 13144 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc); 13145 } 13146 13147 // For larger types, split into legal sized chunks. 
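// Illustrative sketch of the split: with v16i16 inputs (ScalarType == i16, so
// LegalLanes == 8) the loop below builds roughly
//   p0  = ARMISD::VQDMULH (extract_subvector x, 0), (extract_subvector y, 0)
//   p1  = ARMISD::VQDMULH (extract_subvector x, 8), (extract_subvector y, 8)
//   res = sign_extend (concat_vectors p0, p1)
// so each 128-bit chunk maps to one VQDMULH and the pieces are rejoined
// before the final extend back to VT.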
13148 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type"); 13149 unsigned NumParts = VecVT.getSizeInBits() / 128; 13150 SmallVector<SDValue> Parts; 13151 for (unsigned I = 0; I < NumParts; ++I) { 13152 SDValue Inp0 = 13153 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0), 13154 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 13155 SDValue Inp1 = 13156 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0), 13157 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 13158 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 13159 Parts.push_back(VQDMULH); 13160 } 13161 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, 13162 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); 13163 } 13164 13165 static SDValue PerformVSELECTCombine(SDNode *N, 13166 TargetLowering::DAGCombinerInfo &DCI, 13167 const ARMSubtarget *Subtarget) { 13168 if (!Subtarget->hasMVEIntegerOps()) 13169 return SDValue(); 13170 13171 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) 13172 return V; 13173 13174 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 13175 // 13176 // We need to re-implement this optimization here as the implementation in the 13177 // Target-Independent DAGCombiner does not handle the kind of constant we make 13178 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for 13179 // good reason, allowing truncation there would break other targets). 13180 // 13181 // Currently, this is only done for MVE, as it's the only target that benefits 13182 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). 13183 if (N->getOperand(0).getOpcode() != ISD::XOR) 13184 return SDValue(); 13185 SDValue XOR = N->getOperand(0); 13186 13187 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. 13188 // It is important to check with truncation allowed as the BUILD_VECTORs we 13189 // generate in those situations will truncate their operands. 13190 ConstantSDNode *Const = 13191 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, 13192 /*AllowTruncation*/ true); 13193 if (!Const || !Const->isOne()) 13194 return SDValue(); 13195 13196 // Rewrite into vselect(cond, rhs, lhs). 13197 SDValue Cond = XOR->getOperand(0); 13198 SDValue LHS = N->getOperand(1); 13199 SDValue RHS = N->getOperand(2); 13200 EVT Type = N->getValueType(0); 13201 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); 13202 } 13203 13204 // Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n 13205 static SDValue PerformVSetCCToVCTPCombine(SDNode *N, 13206 TargetLowering::DAGCombinerInfo &DCI, 13207 const ARMSubtarget *Subtarget) { 13208 SDValue Op0 = N->getOperand(0); 13209 SDValue Op1 = N->getOperand(1); 13210 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 13211 EVT VT = N->getValueType(0); 13212 13213 if (!Subtarget->hasMVEIntegerOps() || 13214 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13215 return SDValue(); 13216 13217 if (CC == ISD::SETUGE) { 13218 std::swap(Op0, Op1); 13219 CC = ISD::SETULT; 13220 } 13221 13222 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || 13223 Op0.getOpcode() != ISD::BUILD_VECTOR) 13224 return SDValue(); 13225 13226 // Check first operand is BuildVector of 0,1,2,... 
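// For example (sketch): with a v4i1 result,
//   setcc ult (build_vector 0, 1, 2, 3), (splat n)
// is folded to
//   intrinsic_wo_chain arm_mve_vctp32, (zext_or_trunc n)
// i.e. a single VCTP producing the "first n lanes active" predicate, which is
// what the element-by-element check below is establishing.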
13227 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { 13228 if (!Op0.getOperand(I).isUndef() && 13229 !(isa<ConstantSDNode>(Op0.getOperand(I)) && 13230 Op0.getConstantOperandVal(I) == I)) 13231 return SDValue(); 13232 } 13233 13234 // The second is a Splat of Op1S 13235 SDValue Op1S = DCI.DAG.getSplatValue(Op1); 13236 if (!Op1S) 13237 return SDValue(); 13238 13239 unsigned Opc; 13240 switch (VT.getVectorNumElements()) { 13241 case 2: 13242 Opc = Intrinsic::arm_mve_vctp64; 13243 break; 13244 case 4: 13245 Opc = Intrinsic::arm_mve_vctp32; 13246 break; 13247 case 8: 13248 Opc = Intrinsic::arm_mve_vctp16; 13249 break; 13250 case 16: 13251 Opc = Intrinsic::arm_mve_vctp8; 13252 break; 13253 default: 13254 return SDValue(); 13255 } 13256 13257 SDLoc DL(N); 13258 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13259 DCI.DAG.getConstant(Opc, DL, MVT::i32), 13260 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); 13261 } 13262 13263 static SDValue PerformABSCombine(SDNode *N, 13264 TargetLowering::DAGCombinerInfo &DCI, 13265 const ARMSubtarget *Subtarget) { 13266 SelectionDAG &DAG = DCI.DAG; 13267 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13268 13269 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 13270 return SDValue(); 13271 13272 return TLI.expandABS(N, DAG); 13273 } 13274 13275 /// PerformADDECombine - Target-specific dag combine transform from 13276 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 13277 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 13278 static SDValue PerformADDECombine(SDNode *N, 13279 TargetLowering::DAGCombinerInfo &DCI, 13280 const ARMSubtarget *Subtarget) { 13281 // Only ARM and Thumb2 support UMLAL/SMLAL. 13282 if (Subtarget->isThumb1Only()) 13283 return PerformAddeSubeCombine(N, DCI, Subtarget); 13284 13285 // Only perform the checks after legalize when the pattern is available. 13286 if (DCI.isBeforeLegalize()) return SDValue(); 13287 13288 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 13289 } 13290 13291 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 13292 /// operands N0 and N1. This is a helper for PerformADDCombine that is 13293 /// called with the default operands, and if that fails, with commuted 13294 /// operands. 13295 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 13296 TargetLowering::DAGCombinerInfo &DCI, 13297 const ARMSubtarget *Subtarget){ 13298 // Attempt to create vpadd for this add. 13299 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 13300 return Result; 13301 13302 // Attempt to create vpaddl for this add. 
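// (NEON VPADDL adds adjacent lane pairs and widens the result, so - roughly -
// an add whose two operands are the even and odd lanes of one source vector,
// whether expressed via VUZP or via a BUILD_VECTOR of extracts, can become a
// single vpaddl of that vector; the helpers below match those two shapes.)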
13303 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 13304 return Result; 13305 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 13306 Subtarget)) 13307 return Result; 13308 13309 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 13310 if (N0.getNode()->hasOneUse()) 13311 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 13312 return Result; 13313 return SDValue(); 13314 } 13315 13316 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { 13317 EVT VT = N->getValueType(0); 13318 SDValue N0 = N->getOperand(0); 13319 SDValue N1 = N->getOperand(1); 13320 SDLoc dl(N); 13321 13322 auto IsVecReduce = [](SDValue Op) { 13323 switch (Op.getOpcode()) { 13324 case ISD::VECREDUCE_ADD: 13325 case ARMISD::VADDVs: 13326 case ARMISD::VADDVu: 13327 case ARMISD::VMLAVs: 13328 case ARMISD::VMLAVu: 13329 return true; 13330 } 13331 return false; 13332 }; 13333 13334 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) { 13335 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) -> 13336 // add(add(X, vecreduce(Y)), vecreduce(Z)) 13337 // to make better use of vaddva style instructions. 13338 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && 13339 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && 13340 !isa<ConstantSDNode>(N0)) { 13341 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); 13342 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); 13343 } 13344 // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> 13345 // add(add(add(A, C), reduce(B)), reduce(D)) 13346 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && 13347 N1.getOpcode() == ISD::ADD) { 13348 unsigned N0RedOp = 0; 13349 if (!IsVecReduce(N0.getOperand(N0RedOp))) { 13350 N0RedOp = 1; 13351 if (!IsVecReduce(N0.getOperand(N0RedOp))) 13352 return SDValue(); 13353 } 13354 13355 unsigned N1RedOp = 0; 13356 if (!IsVecReduce(N1.getOperand(N1RedOp))) 13357 N1RedOp = 1; 13358 if (!IsVecReduce(N1.getOperand(N1RedOp))) 13359 return SDValue(); 13360 13361 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp), 13362 N1.getOperand(1 - N1RedOp)); 13363 SDValue Add1 = 13364 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp)); 13365 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp)); 13366 } 13367 return SDValue(); 13368 }; 13369 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1)) 13370 return R; 13371 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0)) 13372 return R; 13373 13374 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z))) 13375 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z))) 13376 // by ascending load offsets. This can help cores prefetch if the order of 13377 // loads is more predictable. 13378 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) { 13379 // Check if two reductions are known to load data where one is before/after 13380 // another. Return negative if N0 loads data before N1, positive if N1 is 13381 // before N0 and 0 otherwise if nothing is known. 13382 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) { 13383 // Look through to the first operand of a MUL, for the VMLA case. 13384 // Currently only looks at the first operand, in the hope they are equal. 
13385 if (N0.getOpcode() == ISD::MUL) 13386 N0 = N0.getOperand(0); 13387 if (N1.getOpcode() == ISD::MUL) 13388 N1 = N1.getOperand(0); 13389 13390 // Return true if the two operands are loads to the same object and the 13391 // offset of the first is known to be less than the offset of the second. 13392 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0); 13393 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1); 13394 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() || 13395 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() || 13396 Load1->isIndexed()) 13397 return 0; 13398 13399 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG); 13400 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG); 13401 13402 if (!BaseLocDecomp0.getBase() || 13403 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() || 13404 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset()) 13405 return 0; 13406 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset()) 13407 return -1; 13408 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset()) 13409 return 1; 13410 return 0; 13411 }; 13412 13413 SDValue X; 13414 if (N0.getOpcode() == ISD::ADD) { 13415 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { 13416 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), 13417 N0.getOperand(1).getOperand(0)); 13418 if (IsBefore < 0) { 13419 X = N0.getOperand(0); 13420 N0 = N0.getOperand(1); 13421 } else if (IsBefore > 0) { 13422 X = N0.getOperand(1); 13423 N0 = N0.getOperand(0); 13424 } else 13425 return SDValue(); 13426 } else if (IsVecReduce(N0.getOperand(0))) { 13427 X = N0.getOperand(1); 13428 N0 = N0.getOperand(0); 13429 } else if (IsVecReduce(N0.getOperand(1))) { 13430 X = N0.getOperand(0); 13431 N0 = N0.getOperand(1); 13432 } else 13433 return SDValue(); 13434 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) && 13435 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) { 13436 // Note this is backward to how you would expect. We create 13437 // add(reduce(load + 16), reduce(load + 0)) so that the 13438 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving 13439 // the X as VADDV(load + 0) 13440 return DAG.getNode(ISD::ADD, dl, VT, N1, N0); 13441 } else 13442 return SDValue(); 13443 13444 if (!IsVecReduce(N0) || !IsVecReduce(N1)) 13445 return SDValue(); 13446 13447 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0) 13448 return SDValue(); 13449 13450 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0) 13451 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1); 13452 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0); 13453 }; 13454 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true)) 13455 return R; 13456 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false)) 13457 return R; 13458 return SDValue(); 13459 } 13460 13461 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, 13462 const ARMSubtarget *Subtarget) { 13463 if (!Subtarget->hasMVEIntegerOps()) 13464 return SDValue(); 13465 13466 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG)) 13467 return R; 13468 13469 EVT VT = N->getValueType(0); 13470 SDValue N0 = N->getOperand(0); 13471 SDValue N1 = N->getOperand(1); 13472 SDLoc dl(N); 13473 13474 if (VT != MVT::i64) 13475 return SDValue(); 13476 13477 // We are looking for a i64 add of a VADDLVx. 
Due to these being i64's, this 13478 // will look like: 13479 // t1: i32,i32 = ARMISD::VADDLVs x 13480 // t2: i64 = build_pair t1, t1:1 13481 // t3: i64 = add t2, y 13482 // Otherwise we try to push the add up above VADDLVAx, to potentially allow 13483 // the add to be simplified seperately. 13484 // We also need to check for sext / zext and commutitive adds. 13485 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, 13486 SDValue NB) { 13487 if (NB->getOpcode() != ISD::BUILD_PAIR) 13488 return SDValue(); 13489 SDValue VecRed = NB->getOperand(0); 13490 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) || 13491 VecRed.getResNo() != 0 || 13492 NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) 13493 return SDValue(); 13494 13495 if (VecRed->getOpcode() == OpcodeA) { 13496 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y) 13497 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 13498 VecRed.getOperand(0), VecRed.getOperand(1)); 13499 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA); 13500 } 13501 13502 SmallVector<SDValue, 4> Ops; 13503 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 13504 DAG.getConstant(0, dl, MVT::i32))); 13505 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 13506 DAG.getConstant(1, dl, MVT::i32))); 13507 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0; 13508 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++) 13509 Ops.push_back(VecRed->getOperand(I)); 13510 SDValue Red = 13511 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops); 13512 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, 13513 SDValue(Red.getNode(), 1)); 13514 }; 13515 13516 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) 13517 return M; 13518 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) 13519 return M; 13520 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) 13521 return M; 13522 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) 13523 return M; 13524 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) 13525 return M; 13526 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) 13527 return M; 13528 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) 13529 return M; 13530 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) 13531 return M; 13532 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) 13533 return M; 13534 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) 13535 return M; 13536 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) 13537 return M; 13538 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) 13539 return M; 13540 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) 13541 return M; 13542 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) 13543 return M; 13544 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) 13545 return M; 13546 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) 13547 return M; 13548 return SDValue(); 13549 } 13550 13551 bool 13552 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 13553 CombineLevel Level) const { 13554 if (Level == BeforeLegalizeTypes) 13555 return true; 13556 13557 if (N->getOpcode() != ISD::SHL) 13558 return true; 13559 13560 if 
(Subtarget->isThumb1Only()) { 13561 // Avoid making expensive immediates by commuting shifts. (This logic 13562 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 13563 // for free.) 13564 if (N->getOpcode() != ISD::SHL) 13565 return true; 13566 SDValue N1 = N->getOperand(0); 13567 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 13568 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 13569 return true; 13570 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 13571 if (Const->getAPIntValue().ult(256)) 13572 return false; 13573 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 13574 Const->getAPIntValue().sgt(-256)) 13575 return false; 13576 } 13577 return true; 13578 } 13579 13580 // Turn off commute-with-shift transform after legalization, so it doesn't 13581 // conflict with PerformSHLSimplify. (We could try to detect when 13582 // PerformSHLSimplify would trigger more precisely, but it isn't 13583 // really necessary.) 13584 return false; 13585 } 13586 13587 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 13588 const SDNode *N, CombineLevel Level) const { 13589 if (!Subtarget->isThumb1Only()) 13590 return true; 13591 13592 if (Level == BeforeLegalizeTypes) 13593 return true; 13594 13595 return false; 13596 } 13597 13598 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 13599 if (!Subtarget->hasNEON()) { 13600 if (Subtarget->isThumb1Only()) 13601 return VT.getScalarSizeInBits() <= 32; 13602 return true; 13603 } 13604 return VT.isScalarInteger(); 13605 } 13606 13607 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, 13608 EVT VT) const { 13609 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) 13610 return false; 13611 13612 switch (FPVT.getSimpleVT().SimpleTy) { 13613 case MVT::f16: 13614 return Subtarget->hasVFP2Base(); 13615 case MVT::f32: 13616 return Subtarget->hasVFP2Base(); 13617 case MVT::f64: 13618 return Subtarget->hasFP64(); 13619 case MVT::v4f32: 13620 case MVT::v8f16: 13621 return Subtarget->hasMVEFloatOps(); 13622 default: 13623 return false; 13624 } 13625 } 13626 13627 static SDValue PerformSHLSimplify(SDNode *N, 13628 TargetLowering::DAGCombinerInfo &DCI, 13629 const ARMSubtarget *ST) { 13630 // Allow the generic combiner to identify potential bswaps. 13631 if (DCI.isBeforeLegalize()) 13632 return SDValue(); 13633 13634 // DAG combiner will fold: 13635 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 13636 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 13637 // Other code patterns that can be also be modified have the following form: 13638 // b + ((a << 1) | 510) 13639 // b + ((a << 1) & 510) 13640 // b + ((a << 1) ^ 510) 13641 // b + ((a << 1) + 510) 13642 13643 // Many instructions can perform the shift for free, but it requires both 13644 // the operands to be registers. If c1 << c2 is too large, a mov immediate 13645 // instruction will needed. So, unfold back to the original pattern if: 13646 // - if c1 and c2 are small enough that they don't require mov imms. 13647 // - the user(s) of the node can perform an shl 13648 13649 // No shifted operands for 16-bit instructions. 13650 if (ST->isThumb() && ST->isThumb1Only()) 13651 return SDValue(); 13652 13653 // Check that all the users could perform the shl themselves. 
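// (Worked example, roughly: for b + ((a << 1) | 510) the combiner will have
// produced (add b, (or (shl a, 1), 510)); this code rewrites the inner node
// back to (shl (or a, 255), 1) so the user add can fold the shift into its
// shifted-register operand instead of materialising the larger constant,
// hence the per-user check below.)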
13654 for (auto U : N->uses()) { 13655 switch(U->getOpcode()) { 13656 default: 13657 return SDValue(); 13658 case ISD::SUB: 13659 case ISD::ADD: 13660 case ISD::AND: 13661 case ISD::OR: 13662 case ISD::XOR: 13663 case ISD::SETCC: 13664 case ARMISD::CMP: 13665 // Check that the user isn't already using a constant because there 13666 // aren't any instructions that support an immediate operand and a 13667 // shifted operand. 13668 if (isa<ConstantSDNode>(U->getOperand(0)) || 13669 isa<ConstantSDNode>(U->getOperand(1))) 13670 return SDValue(); 13671 13672 // Check that it's not already using a shift. 13673 if (U->getOperand(0).getOpcode() == ISD::SHL || 13674 U->getOperand(1).getOpcode() == ISD::SHL) 13675 return SDValue(); 13676 break; 13677 } 13678 } 13679 13680 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 13681 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 13682 return SDValue(); 13683 13684 if (N->getOperand(0).getOpcode() != ISD::SHL) 13685 return SDValue(); 13686 13687 SDValue SHL = N->getOperand(0); 13688 13689 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13690 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 13691 if (!C1ShlC2 || !C2) 13692 return SDValue(); 13693 13694 APInt C2Int = C2->getAPIntValue(); 13695 APInt C1Int = C1ShlC2->getAPIntValue(); 13696 13697 // Check that performing a lshr will not lose any information. 13698 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 13699 C2Int.getBitWidth() - C2->getZExtValue()); 13700 if ((C1Int & Mask) != C1Int) 13701 return SDValue(); 13702 13703 // Shift the first constant. 13704 C1Int.lshrInPlace(C2Int); 13705 13706 // The immediates are encoded as an 8-bit value that can be rotated. 13707 auto LargeImm = [](const APInt &Imm) { 13708 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 13709 return Imm.getBitWidth() - Zeros > 8; 13710 }; 13711 13712 if (LargeImm(C1Int) || LargeImm(C2Int)) 13713 return SDValue(); 13714 13715 SelectionDAG &DAG = DCI.DAG; 13716 SDLoc dl(N); 13717 SDValue X = SHL.getOperand(0); 13718 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 13719 DAG.getConstant(C1Int, dl, MVT::i32)); 13720 // Shift left to compensate for the lshr of C1Int. 13721 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 13722 13723 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 13724 SHL.dump(); N->dump()); 13725 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 13726 return Res; 13727 } 13728 13729 13730 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 13731 /// 13732 static SDValue PerformADDCombine(SDNode *N, 13733 TargetLowering::DAGCombinerInfo &DCI, 13734 const ARMSubtarget *Subtarget) { 13735 SDValue N0 = N->getOperand(0); 13736 SDValue N1 = N->getOperand(1); 13737 13738 // Only works one way, because it needs an immediate operand. 13739 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13740 return Result; 13741 13742 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget)) 13743 return Result; 13744 13745 // First try with the default operand order. 13746 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 13747 return Result; 13748 13749 // If that didn't work, try again with the operands commuted. 
13750 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 13751 } 13752 13753 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) 13754 // providing -X is as cheap as X (currently, just a constant). 13755 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) { 13756 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) 13757 return SDValue(); 13758 SDValue CSINC = N->getOperand(1); 13759 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) 13760 return SDValue(); 13761 13762 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); 13763 if (!X) 13764 return SDValue(); 13765 13766 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, 13767 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0), 13768 CSINC.getOperand(0)), 13769 CSINC.getOperand(1), CSINC.getOperand(2), 13770 CSINC.getOperand(3)); 13771 } 13772 13773 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 13774 /// 13775 static SDValue PerformSUBCombine(SDNode *N, 13776 TargetLowering::DAGCombinerInfo &DCI, 13777 const ARMSubtarget *Subtarget) { 13778 SDValue N0 = N->getOperand(0); 13779 SDValue N1 = N->getOperand(1); 13780 13781 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 13782 if (N1.getNode()->hasOneUse()) 13783 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 13784 return Result; 13785 13786 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG)) 13787 return R; 13788 13789 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 13790 return SDValue(); 13791 13792 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 13793 // so that we can readily pattern match more mve instructions which can use 13794 // a scalar operand. 13795 SDValue VDup = N->getOperand(1); 13796 if (VDup->getOpcode() != ARMISD::VDUP) 13797 return SDValue(); 13798 13799 SDValue VMov = N->getOperand(0); 13800 if (VMov->getOpcode() == ISD::BITCAST) 13801 VMov = VMov->getOperand(0); 13802 13803 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 13804 return SDValue(); 13805 13806 SDLoc dl(N); 13807 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 13808 DCI.DAG.getConstant(0, dl, MVT::i32), 13809 VDup->getOperand(0)); 13810 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 13811 } 13812 13813 /// PerformVMULCombine 13814 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 13815 /// special multiplier accumulator forwarding. 
13816 /// vmul d3, d0, d2 13817 /// vmla d3, d1, d2 13818 /// is faster than 13819 /// vadd d3, d0, d1 13820 /// vmul d3, d3, d2 13821 // However, for (A + B) * (A + B), 13822 // vadd d2, d0, d1 13823 // vmul d3, d0, d2 13824 // vmla d3, d1, d2 13825 // is slower than 13826 // vadd d2, d0, d1 13827 // vmul d3, d2, d2 13828 static SDValue PerformVMULCombine(SDNode *N, 13829 TargetLowering::DAGCombinerInfo &DCI, 13830 const ARMSubtarget *Subtarget) { 13831 if (!Subtarget->hasVMLxForwarding()) 13832 return SDValue(); 13833 13834 SelectionDAG &DAG = DCI.DAG; 13835 SDValue N0 = N->getOperand(0); 13836 SDValue N1 = N->getOperand(1); 13837 unsigned Opcode = N0.getOpcode(); 13838 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13839 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 13840 Opcode = N1.getOpcode(); 13841 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13842 Opcode != ISD::FADD && Opcode != ISD::FSUB) 13843 return SDValue(); 13844 std::swap(N0, N1); 13845 } 13846 13847 if (N0 == N1) 13848 return SDValue(); 13849 13850 EVT VT = N->getValueType(0); 13851 SDLoc DL(N); 13852 SDValue N00 = N0->getOperand(0); 13853 SDValue N01 = N0->getOperand(1); 13854 return DAG.getNode(Opcode, DL, VT, 13855 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 13856 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 13857 } 13858 13859 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, 13860 const ARMSubtarget *Subtarget) { 13861 EVT VT = N->getValueType(0); 13862 if (VT != MVT::v2i64) 13863 return SDValue(); 13864 13865 SDValue N0 = N->getOperand(0); 13866 SDValue N1 = N->getOperand(1); 13867 13868 auto IsSignExt = [&](SDValue Op) { 13869 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) 13870 return SDValue(); 13871 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); 13872 if (VT.getScalarSizeInBits() == 32) 13873 return Op->getOperand(0); 13874 return SDValue(); 13875 }; 13876 auto IsZeroExt = [&](SDValue Op) { 13877 // Zero extends are a little more awkward. At the point we are matching 13878 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. 13879 // That might be before of after a bitcast depending on how the and is 13880 // placed. Because this has to look through bitcasts, it is currently only 13881 // supported on LE. 
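// For example (sketch): on little-endian, zext v2i32 -> v2i64 typically
// appears here as
//   and X, (bitcast (build_vector -1, 0, -1, 0 : v4i32))
// clearing the high 32 bits of each 64-bit lane, which is the mask shape
// matched below.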
13882 if (!Subtarget->isLittle()) 13883 return SDValue(); 13884 13885 SDValue And = Op; 13886 if (And->getOpcode() == ISD::BITCAST) 13887 And = And->getOperand(0); 13888 if (And->getOpcode() != ISD::AND) 13889 return SDValue(); 13890 SDValue Mask = And->getOperand(1); 13891 if (Mask->getOpcode() == ISD::BITCAST) 13892 Mask = Mask->getOperand(0); 13893 13894 if (Mask->getOpcode() != ISD::BUILD_VECTOR || 13895 Mask.getValueType() != MVT::v4i32) 13896 return SDValue(); 13897 if (isAllOnesConstant(Mask->getOperand(0)) && 13898 isNullConstant(Mask->getOperand(1)) && 13899 isAllOnesConstant(Mask->getOperand(2)) && 13900 isNullConstant(Mask->getOperand(3))) 13901 return And->getOperand(0); 13902 return SDValue(); 13903 }; 13904 13905 SDLoc dl(N); 13906 if (SDValue Op0 = IsSignExt(N0)) { 13907 if (SDValue Op1 = IsSignExt(N1)) { 13908 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13909 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13910 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); 13911 } 13912 } 13913 if (SDValue Op0 = IsZeroExt(N0)) { 13914 if (SDValue Op1 = IsZeroExt(N1)) { 13915 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13916 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13917 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); 13918 } 13919 } 13920 13921 return SDValue(); 13922 } 13923 13924 static SDValue PerformMULCombine(SDNode *N, 13925 TargetLowering::DAGCombinerInfo &DCI, 13926 const ARMSubtarget *Subtarget) { 13927 SelectionDAG &DAG = DCI.DAG; 13928 13929 EVT VT = N->getValueType(0); 13930 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) 13931 return PerformMVEVMULLCombine(N, DAG, Subtarget); 13932 13933 if (Subtarget->isThumb1Only()) 13934 return SDValue(); 13935 13936 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13937 return SDValue(); 13938 13939 if (VT.is64BitVector() || VT.is128BitVector()) 13940 return PerformVMULCombine(N, DCI, Subtarget); 13941 if (VT != MVT::i32) 13942 return SDValue(); 13943 13944 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13945 if (!C) 13946 return SDValue(); 13947 13948 int64_t MulAmt = C->getSExtValue(); 13949 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 13950 13951 ShiftAmt = ShiftAmt & (32 - 1); 13952 SDValue V = N->getOperand(0); 13953 SDLoc DL(N); 13954 13955 SDValue Res; 13956 MulAmt >>= ShiftAmt; 13957 13958 if (MulAmt >= 0) { 13959 if (isPowerOf2_32(MulAmt - 1)) { 13960 // (mul x, 2^N + 1) => (add (shl x, N), x) 13961 Res = DAG.getNode(ISD::ADD, DL, VT, 13962 V, 13963 DAG.getNode(ISD::SHL, DL, VT, 13964 V, 13965 DAG.getConstant(Log2_32(MulAmt - 1), DL, 13966 MVT::i32))); 13967 } else if (isPowerOf2_32(MulAmt + 1)) { 13968 // (mul x, 2^N - 1) => (sub (shl x, N), x) 13969 Res = DAG.getNode(ISD::SUB, DL, VT, 13970 DAG.getNode(ISD::SHL, DL, VT, 13971 V, 13972 DAG.getConstant(Log2_32(MulAmt + 1), DL, 13973 MVT::i32)), 13974 V); 13975 } else 13976 return SDValue(); 13977 } else { 13978 uint64_t MulAmtAbs = -MulAmt; 13979 if (isPowerOf2_32(MulAmtAbs + 1)) { 13980 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 13981 Res = DAG.getNode(ISD::SUB, DL, VT, 13982 V, 13983 DAG.getNode(ISD::SHL, DL, VT, 13984 V, 13985 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 13986 MVT::i32))); 13987 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 13988 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 13989 Res = DAG.getNode(ISD::ADD, DL, VT, 13990 V, 13991 DAG.getNode(ISD::SHL, DL, VT, 13992 V, 
13993 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 13994 MVT::i32))); 13995 Res = DAG.getNode(ISD::SUB, DL, VT, 13996 DAG.getConstant(0, DL, MVT::i32), Res); 13997 } else 13998 return SDValue(); 13999 } 14000 14001 if (ShiftAmt != 0) 14002 Res = DAG.getNode(ISD::SHL, DL, VT, 14003 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 14004 14005 // Do not add new nodes to DAG combiner worklist. 14006 DCI.CombineTo(N, Res, false); 14007 return SDValue(); 14008 } 14009 14010 static SDValue CombineANDShift(SDNode *N, 14011 TargetLowering::DAGCombinerInfo &DCI, 14012 const ARMSubtarget *Subtarget) { 14013 // Allow DAGCombine to pattern-match before we touch the canonical form. 14014 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 14015 return SDValue(); 14016 14017 if (N->getValueType(0) != MVT::i32) 14018 return SDValue(); 14019 14020 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14021 if (!N1C) 14022 return SDValue(); 14023 14024 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 14025 // Don't transform uxtb/uxth. 14026 if (C1 == 255 || C1 == 65535) 14027 return SDValue(); 14028 14029 SDNode *N0 = N->getOperand(0).getNode(); 14030 if (!N0->hasOneUse()) 14031 return SDValue(); 14032 14033 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 14034 return SDValue(); 14035 14036 bool LeftShift = N0->getOpcode() == ISD::SHL; 14037 14038 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 14039 if (!N01C) 14040 return SDValue(); 14041 14042 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 14043 if (!C2 || C2 >= 32) 14044 return SDValue(); 14045 14046 // Clear irrelevant bits in the mask. 14047 if (LeftShift) 14048 C1 &= (-1U << C2); 14049 else 14050 C1 &= (-1U >> C2); 14051 14052 SelectionDAG &DAG = DCI.DAG; 14053 SDLoc DL(N); 14054 14055 // We have a pattern of the form "(and (shl x, c2) c1)" or 14056 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 14057 // transform to a pair of shifts, to save materializing c1. 14058 14059 // First pattern: right shift, then mask off leading bits. 14060 // FIXME: Use demanded bits? 14061 if (!LeftShift && isMask_32(C1)) { 14062 uint32_t C3 = countLeadingZeros(C1); 14063 if (C2 < C3) { 14064 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 14065 DAG.getConstant(C3 - C2, DL, MVT::i32)); 14066 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 14067 DAG.getConstant(C3, DL, MVT::i32)); 14068 } 14069 } 14070 14071 // First pattern, reversed: left shift, then mask off trailing bits. 14072 if (LeftShift && isMask_32(~C1)) { 14073 uint32_t C3 = countTrailingZeros(C1); 14074 if (C2 < C3) { 14075 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 14076 DAG.getConstant(C3 - C2, DL, MVT::i32)); 14077 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 14078 DAG.getConstant(C3, DL, MVT::i32)); 14079 } 14080 } 14081 14082 // Second pattern: left shift, then mask off leading bits. 14083 // FIXME: Use demanded bits? 14084 if (LeftShift && isShiftedMask_32(C1)) { 14085 uint32_t Trailing = countTrailingZeros(C1); 14086 uint32_t C3 = countLeadingZeros(C1); 14087 if (Trailing == C2 && C2 + C3 < 32) { 14088 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 14089 DAG.getConstant(C2 + C3, DL, MVT::i32)); 14090 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 14091 DAG.getConstant(C3, DL, MVT::i32)); 14092 } 14093 } 14094 14095 // Second pattern, reversed: right shift, then mask off trailing bits. 14096 // FIXME: Handle other patterns of known/demanded bits. 
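// Illustrative example: for (and (srl x, 4), 0x0ffffff0) we have C2 == 4 ==
// countLeadingZeros(C1) and C3 == 4, so this becomes (shl (srl x, 8), 4),
// keeping bits [31:8] of x in positions [27:4] without materialising the
// mask.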
14097 if (!LeftShift && isShiftedMask_32(C1)) { 14098 uint32_t Leading = countLeadingZeros(C1); 14099 uint32_t C3 = countTrailingZeros(C1); 14100 if (Leading == C2 && C2 + C3 < 32) { 14101 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 14102 DAG.getConstant(C2 + C3, DL, MVT::i32)); 14103 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 14104 DAG.getConstant(C3, DL, MVT::i32)); 14105 } 14106 } 14107 14108 // FIXME: Transform "(and (shl x, c2) c1)" -> 14109 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 14110 // c1. 14111 return SDValue(); 14112 } 14113 14114 static SDValue PerformANDCombine(SDNode *N, 14115 TargetLowering::DAGCombinerInfo &DCI, 14116 const ARMSubtarget *Subtarget) { 14117 // Attempt to use immediate-form VBIC 14118 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 14119 SDLoc dl(N); 14120 EVT VT = N->getValueType(0); 14121 SelectionDAG &DAG = DCI.DAG; 14122 14123 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || 14124 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) 14125 return SDValue(); 14126 14127 APInt SplatBits, SplatUndef; 14128 unsigned SplatBitSize; 14129 bool HasAnyUndefs; 14130 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 14131 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 14132 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 14133 SplatBitSize == 64) { 14134 EVT VbicVT; 14135 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 14136 SplatUndef.getZExtValue(), SplatBitSize, 14137 DAG, dl, VbicVT, VT, OtherModImm); 14138 if (Val.getNode()) { 14139 SDValue Input = 14140 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 14141 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 14142 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 14143 } 14144 } 14145 } 14146 14147 if (!Subtarget->isThumb1Only()) { 14148 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 14149 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 14150 return Result; 14151 14152 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14153 return Result; 14154 } 14155 14156 if (Subtarget->isThumb1Only()) 14157 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 14158 return Result; 14159 14160 return SDValue(); 14161 } 14162 14163 // Try combining OR nodes to SMULWB, SMULWT. 14164 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 14165 TargetLowering::DAGCombinerInfo &DCI, 14166 const ARMSubtarget *Subtarget) { 14167 if (!Subtarget->hasV6Ops() || 14168 (Subtarget->isThumb() && 14169 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 14170 return SDValue(); 14171 14172 SDValue SRL = OR->getOperand(0); 14173 SDValue SHL = OR->getOperand(1); 14174 14175 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 14176 SRL = OR->getOperand(1); 14177 SHL = OR->getOperand(0); 14178 } 14179 if (!isSRL16(SRL) || !isSHL16(SHL)) 14180 return SDValue(); 14181 14182 // The first operands to the shifts need to be the two results from the 14183 // same smul_lohi node. 
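// (Roughly: (or (srl lo, 16), (shl hi, 16)) reassembles bits [47:16] of the
// 64-bit product, which is what SMULWB/SMULWT return for a 32x16 multiply;
// the checks below make sure both shift inputs really are the two halves of
// one smul_lohi.)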
14184 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 14185 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 14186 return SDValue(); 14187 14188 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 14189 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 14190 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 14191 return SDValue(); 14192 14193 // Now we have: 14194 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) 14195 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. 14196 // For SMUWB the 16-bit value will signed extended somehow. 14197 // For SMULWT only the SRA is required. 14198 // Check both sides of SMUL_LOHI 14199 SDValue OpS16 = SMULLOHI->getOperand(0); 14200 SDValue OpS32 = SMULLOHI->getOperand(1); 14201 14202 SelectionDAG &DAG = DCI.DAG; 14203 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { 14204 OpS16 = OpS32; 14205 OpS32 = SMULLOHI->getOperand(0); 14206 } 14207 14208 SDLoc dl(OR); 14209 unsigned Opcode = 0; 14210 if (isS16(OpS16, DAG)) 14211 Opcode = ARMISD::SMULWB; 14212 else if (isSRA16(OpS16)) { 14213 Opcode = ARMISD::SMULWT; 14214 OpS16 = OpS16->getOperand(0); 14215 } 14216 else 14217 return SDValue(); 14218 14219 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); 14220 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); 14221 return SDValue(OR, 0); 14222 } 14223 14224 static SDValue PerformORCombineToBFI(SDNode *N, 14225 TargetLowering::DAGCombinerInfo &DCI, 14226 const ARMSubtarget *Subtarget) { 14227 // BFI is only available on V6T2+ 14228 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 14229 return SDValue(); 14230 14231 EVT VT = N->getValueType(0); 14232 SDValue N0 = N->getOperand(0); 14233 SDValue N1 = N->getOperand(1); 14234 SelectionDAG &DAG = DCI.DAG; 14235 SDLoc DL(N); 14236 // 1) or (and A, mask), val => ARMbfi A, val, mask 14237 // iff (val & mask) == val 14238 // 14239 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 14240 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 14241 // && mask == ~mask2 14242 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 14243 // && ~mask == mask2 14244 // (i.e., copy a bitfield value into another bitfield of the same width) 14245 14246 if (VT != MVT::i32) 14247 return SDValue(); 14248 14249 SDValue N00 = N0.getOperand(0); 14250 14251 // The value and the mask need to be constants so we can verify this is 14252 // actually a bitfield set. If the mask is 0xffff, we can do better 14253 // via a movt instruction, so don't use BFI in that case. 14254 SDValue MaskOp = N0.getOperand(1); 14255 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 14256 if (!MaskC) 14257 return SDValue(); 14258 unsigned Mask = MaskC->getZExtValue(); 14259 if (Mask == 0xffff) 14260 return SDValue(); 14261 SDValue Res; 14262 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 14263 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 14264 if (N1C) { 14265 unsigned Val = N1C->getZExtValue(); 14266 if ((Val & ~Mask) != Val) 14267 return SDValue(); 14268 14269 if (ARM::isBitFieldInvertedMask(Mask)) { 14270 Val >>= countTrailingZeros(~Mask); 14271 14272 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 14273 DAG.getConstant(Val, DL, MVT::i32), 14274 DAG.getConstant(Mask, DL, MVT::i32)); 14275 14276 DCI.CombineTo(N, Res, false); 14277 // Return value from the original node to inform the combiner than N is 14278 // now dead. 
14279 return SDValue(N, 0); 14280 } 14281 } else if (N1.getOpcode() == ISD::AND) { 14282 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 14283 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 14284 if (!N11C) 14285 return SDValue(); 14286 unsigned Mask2 = N11C->getZExtValue(); 14287 14288 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 14289 // as is to match. 14290 if (ARM::isBitFieldInvertedMask(Mask) && 14291 (Mask == ~Mask2)) { 14292 // The pack halfword instruction works better for masks that fit it, 14293 // so use that when it's available. 14294 if (Subtarget->hasDSP() && 14295 (Mask == 0xffff || Mask == 0xffff0000)) 14296 return SDValue(); 14297 // 2a 14298 unsigned amt = countTrailingZeros(Mask2); 14299 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 14300 DAG.getConstant(amt, DL, MVT::i32)); 14301 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 14302 DAG.getConstant(Mask, DL, MVT::i32)); 14303 DCI.CombineTo(N, Res, false); 14304 // Return value from the original node to inform the combiner than N is 14305 // now dead. 14306 return SDValue(N, 0); 14307 } else if (ARM::isBitFieldInvertedMask(~Mask) && 14308 (~Mask == Mask2)) { 14309 // The pack halfword instruction works better for masks that fit it, 14310 // so use that when it's available. 14311 if (Subtarget->hasDSP() && 14312 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 14313 return SDValue(); 14314 // 2b 14315 unsigned lsb = countTrailingZeros(Mask); 14316 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 14317 DAG.getConstant(lsb, DL, MVT::i32)); 14318 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 14319 DAG.getConstant(Mask2, DL, MVT::i32)); 14320 DCI.CombineTo(N, Res, false); 14321 // Return value from the original node to inform the combiner than N is 14322 // now dead. 14323 return SDValue(N, 0); 14324 } 14325 } 14326 14327 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 14328 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 14329 ARM::isBitFieldInvertedMask(~Mask)) { 14330 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 14331 // where lsb(mask) == #shamt and masked bits of B are known zero. 14332 SDValue ShAmt = N00.getOperand(1); 14333 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 14334 unsigned LSB = countTrailingZeros(Mask); 14335 if (ShAmtC != LSB) 14336 return SDValue(); 14337 14338 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 14339 DAG.getConstant(~Mask, DL, MVT::i32)); 14340 14341 DCI.CombineTo(N, Res, false); 14342 // Return value from the original node to inform the combiner than N is 14343 // now dead. 
14344 return SDValue(N, 0); 14345 } 14346 14347 return SDValue(); 14348 } 14349 14350 static bool isValidMVECond(unsigned CC, bool IsFloat) { 14351 switch (CC) { 14352 case ARMCC::EQ: 14353 case ARMCC::NE: 14354 case ARMCC::LE: 14355 case ARMCC::GT: 14356 case ARMCC::GE: 14357 case ARMCC::LT: 14358 return true; 14359 case ARMCC::HS: 14360 case ARMCC::HI: 14361 return !IsFloat; 14362 default: 14363 return false; 14364 }; 14365 } 14366 14367 static ARMCC::CondCodes getVCMPCondCode(SDValue N) { 14368 if (N->getOpcode() == ARMISD::VCMP) 14369 return (ARMCC::CondCodes)N->getConstantOperandVal(2); 14370 else if (N->getOpcode() == ARMISD::VCMPZ) 14371 return (ARMCC::CondCodes)N->getConstantOperandVal(1); 14372 else 14373 llvm_unreachable("Not a VCMP/VCMPZ!"); 14374 } 14375 14376 static bool CanInvertMVEVCMP(SDValue N) { 14377 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); 14378 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); 14379 } 14380 14381 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, 14382 const ARMSubtarget *Subtarget) { 14383 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 14384 // together with predicates 14385 EVT VT = N->getValueType(0); 14386 SDLoc DL(N); 14387 SDValue N0 = N->getOperand(0); 14388 SDValue N1 = N->getOperand(1); 14389 14390 auto IsFreelyInvertable = [&](SDValue V) { 14391 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) 14392 return CanInvertMVEVCMP(V); 14393 return false; 14394 }; 14395 14396 // At least one operand must be freely invertable. 14397 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) 14398 return SDValue(); 14399 14400 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT); 14401 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT); 14402 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); 14403 return DAG.getLogicalNOT(DL, And, VT); 14404 } 14405 14406 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 14407 static SDValue PerformORCombine(SDNode *N, 14408 TargetLowering::DAGCombinerInfo &DCI, 14409 const ARMSubtarget *Subtarget) { 14410 // Attempt to use immediate-form VORR 14411 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 14412 SDLoc dl(N); 14413 EVT VT = N->getValueType(0); 14414 SelectionDAG &DAG = DCI.DAG; 14415 14416 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 14417 return SDValue(); 14418 14419 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || 14420 VT == MVT::v8i1 || VT == MVT::v16i1)) 14421 return PerformORCombine_i1(N, DAG, Subtarget); 14422 14423 APInt SplatBits, SplatUndef; 14424 unsigned SplatBitSize; 14425 bool HasAnyUndefs; 14426 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 14427 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 14428 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 14429 SplatBitSize == 64) { 14430 EVT VorrVT; 14431 SDValue Val = 14432 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 14433 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); 14434 if (Val.getNode()) { 14435 SDValue Input = 14436 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 14437 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 14438 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 14439 } 14440 } 14441 } 14442 14443 if (!Subtarget->isThumb1Only()) { 14444 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 14445 if (SDValue Result 
= combineSelectAndUseCommutative(N, false, DCI)) 14446 return Result; 14447 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 14448 return Result; 14449 } 14450 14451 SDValue N0 = N->getOperand(0); 14452 SDValue N1 = N->getOperand(1); 14453 14454 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 14455 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 14456 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 14457 14458 // The code below optimizes (or (and X, Y), Z). 14459 // The AND operand needs to have a single user to make these optimizations 14460 // profitable. 14461 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 14462 return SDValue(); 14463 14464 APInt SplatUndef; 14465 unsigned SplatBitSize; 14466 bool HasAnyUndefs; 14467 14468 APInt SplatBits0, SplatBits1; 14469 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 14470 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 14471 // Ensure that the second operand of both ands are constants 14472 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 14473 HasAnyUndefs) && !HasAnyUndefs) { 14474 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 14475 HasAnyUndefs) && !HasAnyUndefs) { 14476 // Ensure that the bit width of the constants are the same and that 14477 // the splat arguments are logical inverses as per the pattern we 14478 // are trying to simplify. 14479 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 14480 SplatBits0 == ~SplatBits1) { 14481 // Canonicalize the vector type to make instruction selection 14482 // simpler. 14483 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 14484 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, 14485 N0->getOperand(1), 14486 N0->getOperand(0), 14487 N1->getOperand(0)); 14488 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 14489 } 14490 } 14491 } 14492 } 14493 14494 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 14495 // reasonable. 14496 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 14497 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 14498 return Res; 14499 } 14500 14501 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14502 return Result; 14503 14504 return SDValue(); 14505 } 14506 14507 static SDValue PerformXORCombine(SDNode *N, 14508 TargetLowering::DAGCombinerInfo &DCI, 14509 const ARMSubtarget *Subtarget) { 14510 EVT VT = N->getValueType(0); 14511 SelectionDAG &DAG = DCI.DAG; 14512 14513 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 14514 return SDValue(); 14515 14516 if (!Subtarget->isThumb1Only()) { 14517 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 14518 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 14519 return Result; 14520 14521 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14522 return Result; 14523 } 14524 14525 if (Subtarget->hasMVEIntegerOps()) { 14526 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. 
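// For example (sketch): xor (VCMP a, b, eq), splat(1) becomes VCMP a, b, ne,
// provided the inverted condition is valid for MVE (see CanInvertMVEVCMP).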
14527 SDValue N0 = N->getOperand(0); 14528 SDValue N1 = N->getOperand(1); 14529 const TargetLowering *TLI = Subtarget->getTargetLowering(); 14530 if (TLI->isConstTrueVal(N1) && 14531 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { 14532 if (CanInvertMVEVCMP(N0)) { 14533 SDLoc DL(N0); 14534 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); 14535 14536 SmallVector<SDValue, 4> Ops; 14537 Ops.push_back(N0->getOperand(0)); 14538 if (N0->getOpcode() == ARMISD::VCMP) 14539 Ops.push_back(N0->getOperand(1)); 14540 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32)); 14541 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); 14542 } 14543 } 14544 } 14545 14546 return SDValue(); 14547 } 14548 14549 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 14550 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 14551 // their position in "to" (Rd). 14552 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 14553 assert(N->getOpcode() == ARMISD::BFI); 14554 14555 SDValue From = N->getOperand(1); 14556 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 14557 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 14558 14559 // If the Base came from a SHR #C, we can deduce that it is really testing bit 14560 // #C in the base of the SHR. 14561 if (From->getOpcode() == ISD::SRL && 14562 isa<ConstantSDNode>(From->getOperand(1))) { 14563 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 14564 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 14565 FromMask <<= Shift.getLimitedValue(31); 14566 From = From->getOperand(0); 14567 } 14568 14569 return From; 14570 } 14571 14572 // If A and B contain one contiguous set of bits, does A | B == A . B? 14573 // 14574 // Neither A nor B must be zero. 14575 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 14576 unsigned LastActiveBitInA = A.countTrailingZeros(); 14577 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 14578 return LastActiveBitInA - 1 == FirstActiveBitInB; 14579 } 14580 14581 static SDValue FindBFIToCombineWith(SDNode *N) { 14582 // We have a BFI in N. Find a BFI it can combine with, if one exists. 14583 APInt ToMask, FromMask; 14584 SDValue From = ParseBFI(N, ToMask, FromMask); 14585 SDValue To = N->getOperand(0); 14586 14587 SDValue V = To; 14588 if (V.getOpcode() != ARMISD::BFI) 14589 return SDValue(); 14590 14591 APInt NewToMask, NewFromMask; 14592 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 14593 if (NewFrom != From) 14594 return SDValue(); 14595 14596 // Do the written bits conflict with any we've seen so far? 14597 if ((NewToMask & ToMask).getBoolValue()) 14598 // Conflicting bits. 14599 return SDValue(); 14600 14601 // Are the new bits contiguous when combined with the old bits? 
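// e.g. (illustrative) ToMask 0x000000f0 and NewToMask 0x0000000f concatenate
// cleanly into 0x000000ff, whereas 0x000000f0 and 0x00000003 leave a hole at
// bits 2-3 and the two BFIs are left as they are.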
14602 if (BitsProperlyConcatenate(ToMask, NewToMask) && 14603 BitsProperlyConcatenate(FromMask, NewFromMask)) 14604 return V; 14605 if (BitsProperlyConcatenate(NewToMask, ToMask) && 14606 BitsProperlyConcatenate(NewFromMask, FromMask)) 14607 return V; 14608 14609 return SDValue(); 14610 } 14611 14612 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { 14613 SDValue N0 = N->getOperand(0); 14614 SDValue N1 = N->getOperand(1); 14615 14616 if (N1.getOpcode() == ISD::AND) { 14617 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 14618 // the bits being cleared by the AND are not demanded by the BFI. 14619 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 14620 if (!N11C) 14621 return SDValue(); 14622 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14623 unsigned LSB = countTrailingZeros(~InvMask); 14624 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 14625 assert(Width < 14626 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 14627 "undefined behavior"); 14628 unsigned Mask = (1u << Width) - 1; 14629 unsigned Mask2 = N11C->getZExtValue(); 14630 if ((Mask & (~Mask2)) == 0) 14631 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 14632 N->getOperand(0), N1.getOperand(0), N->getOperand(2)); 14633 return SDValue(); 14634 } 14635 14636 // Look for another BFI to combine with. 14637 if (SDValue CombineBFI = FindBFIToCombineWith(N)) { 14638 // We've found a BFI. 14639 APInt ToMask1, FromMask1; 14640 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 14641 14642 APInt ToMask2, FromMask2; 14643 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 14644 assert(From1 == From2); 14645 (void)From2; 14646 14647 // Create a new BFI, combining the two together. 14648 APInt NewFromMask = FromMask1 | FromMask2; 14649 APInt NewToMask = ToMask1 | ToMask2; 14650 14651 EVT VT = N->getValueType(0); 14652 SDLoc dl(N); 14653 14654 if (NewFromMask[0] == 0) 14655 From1 = DAG.getNode( 14656 ISD::SRL, dl, VT, From1, 14657 DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 14658 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1, 14659 DAG.getConstant(~NewToMask, dl, VT)); 14660 } 14661 14662 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so 14663 // that lower bit insertions are performed first, providing that M1 and M2 14664 // do no overlap. This can allow multiple BFI instructions to be combined 14665 // together by the other folds above. 14666 if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 14667 APInt ToMask1 = ~N->getConstantOperandAPInt(2); 14668 APInt ToMask2 = ~N0.getConstantOperandAPInt(2); 14669 14670 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 || 14671 ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros()) 14672 return SDValue(); 14673 14674 EVT VT = N->getValueType(0); 14675 SDLoc dl(N); 14676 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0), 14677 N->getOperand(1), N->getOperand(2)); 14678 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1), 14679 N0.getOperand(2)); 14680 } 14681 14682 return SDValue(); 14683 } 14684 14685 // Check that N is CMPZ(CSINC(0, 0, CC, X)), 14686 // or CMPZ(CMOV(1, 0, CC, $cpsr, X)) 14687 // return X if valid. 
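// For example (sketch): for CMPZ (CSINC 0, 0, NE, X), 0 this returns X with
// CC set to NE; the CMOV forms cover the same select-of-0/1 shape before it
// has been turned into a CSINC.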
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
    return SDValue();
  SDValue CSInc = Cmp->getOperand(0);

  // Ignore any `And 1` nodes that may not yet have been removed. We are
  // looking for a value that produces 1/0, so these have no effect on the
  // code.
  while (CSInc.getOpcode() == ISD::AND &&
         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
    CSInc = CSInc.getOperand(0);

  if (CSInc.getOpcode() == ARMISD::CSINC &&
      isNullConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(3);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(4);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
    CC = ARMCC::getOppositeCondition(
        (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
    return CSInc.getOperand(4);
  }
  return SDValue();
}

static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
  //   t92: glue = ARMISD::CMPZ t74, 0
  //   t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  //   t96: glue = ARMISD::CMPZ t93, 0
  //   t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N, Cond))
    if (Cond == ARMCC::EQ)
      return C;
  return SDValue();
}

static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
  // Fold away an unnecessary CMPZ/CSINC
  // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  //    if C1==EQ -> CSXYZ A, B, C2, D
  //    if C1==NE -> CSXYZ A, B, NOT(C2), D
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
    if (N->getConstantOperandVal(2) == ARMCC::EQ)
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
    if (N->getConstantOperandVal(2) == ARMCC::NE)
      return DAG.getNode(
          N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
          N->getOperand(1),
          DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N),
                          MVT::i32),
          C);
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
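/// A representative fold handled below (illustrative only): when FP64 is
/// available,
///   (vmovrrd (vmovdrr a, b))  -->  a, b
/// i.e. a double that was just assembled from two GPRs is split straight back
/// into those GPRs without going through a VFP register.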
14756 static SDValue PerformVMOVRRDCombine(SDNode *N, 14757 TargetLowering::DAGCombinerInfo &DCI, 14758 const ARMSubtarget *Subtarget) { 14759 // vmovrrd(vmovdrr x, y) -> x,y 14760 SDValue InDouble = N->getOperand(0); 14761 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 14762 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 14763 14764 // vmovrrd(load f64) -> (load i32), (load i32) 14765 SDNode *InNode = InDouble.getNode(); 14766 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 14767 InNode->getValueType(0) == MVT::f64 && 14768 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 14769 !cast<LoadSDNode>(InNode)->isVolatile()) { 14770 // TODO: Should this be done for non-FrameIndex operands? 14771 LoadSDNode *LD = cast<LoadSDNode>(InNode); 14772 14773 SelectionDAG &DAG = DCI.DAG; 14774 SDLoc DL(LD); 14775 SDValue BasePtr = LD->getBasePtr(); 14776 SDValue NewLD1 = 14777 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 14778 LD->getAlignment(), LD->getMemOperand()->getFlags()); 14779 14780 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 14781 DAG.getConstant(4, DL, MVT::i32)); 14782 14783 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 14784 LD->getPointerInfo().getWithOffset(4), 14785 std::min(4U, LD->getAlignment()), 14786 LD->getMemOperand()->getFlags()); 14787 14788 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 14789 if (DCI.DAG.getDataLayout().isBigEndian()) 14790 std::swap (NewLD1, NewLD2); 14791 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 14792 return Result; 14793 } 14794 14795 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d 14796 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b 14797 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14798 isa<ConstantSDNode>(InDouble.getOperand(1))) { 14799 SDValue BV = InDouble.getOperand(0); 14800 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may 14801 // change lane order under big endian. 14802 bool BVSwap = BV.getOpcode() == ISD::BITCAST; 14803 while ( 14804 (BV.getOpcode() == ISD::BITCAST || 14805 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && 14806 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { 14807 BVSwap = BV.getOpcode() == ISD::BITCAST; 14808 BV = BV.getOperand(0); 14809 } 14810 if (BV.getValueType() != MVT::v4i32) 14811 return SDValue(); 14812 14813 // Handle buildvectors, pulling out the correct lane depending on 14814 // endianness. 14815 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; 14816 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 14817 SDValue Op0 = BV.getOperand(Offset); 14818 SDValue Op1 = BV.getOperand(Offset + 1); 14819 if (!Subtarget->isLittle() && BVSwap) 14820 std::swap(Op0, Op1); 14821 14822 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14823 } 14824 14825 // A chain of insert_vectors, grabbing the correct value of the chain of 14826 // inserts. 
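// For example (illustrative): for
//   insert_vector_elt (insert_vector_elt undef, a, 0), b, 1
// with Offset == 0, the walk below ends with Op0 = a and Op1 = b.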
14827 SDValue Op0, Op1; 14828 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { 14829 if (isa<ConstantSDNode>(BV.getOperand(2))) { 14830 if (BV.getConstantOperandVal(2) == Offset) 14831 Op0 = BV.getOperand(1); 14832 if (BV.getConstantOperandVal(2) == Offset + 1) 14833 Op1 = BV.getOperand(1); 14834 } 14835 BV = BV.getOperand(0); 14836 } 14837 if (!Subtarget->isLittle() && BVSwap) 14838 std::swap(Op0, Op1); 14839 if (Op0 && Op1) 14840 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14841 } 14842 14843 return SDValue(); 14844 } 14845 14846 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 14847 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 14848 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 14849 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 14850 SDValue Op0 = N->getOperand(0); 14851 SDValue Op1 = N->getOperand(1); 14852 if (Op0.getOpcode() == ISD::BITCAST) 14853 Op0 = Op0.getOperand(0); 14854 if (Op1.getOpcode() == ISD::BITCAST) 14855 Op1 = Op1.getOperand(0); 14856 if (Op0.getOpcode() == ARMISD::VMOVRRD && 14857 Op0.getNode() == Op1.getNode() && 14858 Op0.getResNo() == 0 && Op1.getResNo() == 1) 14859 return DAG.getNode(ISD::BITCAST, SDLoc(N), 14860 N->getValueType(0), Op0.getOperand(0)); 14861 return SDValue(); 14862 } 14863 14864 static SDValue PerformVMOVhrCombine(SDNode *N, 14865 TargetLowering::DAGCombinerInfo &DCI) { 14866 SDValue Op0 = N->getOperand(0); 14867 14868 // VMOVhr (VMOVrh (X)) -> X 14869 if (Op0->getOpcode() == ARMISD::VMOVrh) 14870 return Op0->getOperand(0); 14871 14872 // FullFP16: half values are passed in S-registers, and we don't 14873 // need any of the bitcast and moves: 14874 // 14875 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 14876 // t5: i32 = bitcast t2 14877 // t18: f16 = ARMISD::VMOVhr t5 14878 if (Op0->getOpcode() == ISD::BITCAST) { 14879 SDValue Copy = Op0->getOperand(0); 14880 if (Copy.getValueType() == MVT::f32 && 14881 Copy->getOpcode() == ISD::CopyFromReg) { 14882 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; 14883 SDValue NewCopy = 14884 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); 14885 return NewCopy; 14886 } 14887 } 14888 14889 // fold (VMOVhr (load x)) -> (load (f16*)x) 14890 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { 14891 if (LN0->hasOneUse() && LN0->isUnindexed() && 14892 LN0->getMemoryVT() == MVT::i16) { 14893 SDValue Load = 14894 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), 14895 LN0->getBasePtr(), LN0->getMemOperand()); 14896 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14897 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); 14898 return Load; 14899 } 14900 } 14901 14902 // Only the bottom 16 bits of the source register are used. 
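// (A VMOVhr only consumes a half-precision value, so anything that feeds only
// the upper 16 bits of Op0, e.g. a redundant mask or extension, is fair game
// for SimplifyDemandedBits below.)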
14903 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14904 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14905 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) 14906 return SDValue(N, 0); 14907 14908 return SDValue(); 14909 } 14910 14911 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) { 14912 SDValue N0 = N->getOperand(0); 14913 EVT VT = N->getValueType(0); 14914 14915 // fold (VMOVrh (fpconst x)) -> const x 14916 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { 14917 APFloat V = C->getValueAPF(); 14918 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); 14919 } 14920 14921 // fold (VMOVrh (load x)) -> (zextload (i16*)x) 14922 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { 14923 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 14924 14925 SDValue Load = 14926 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), 14927 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); 14928 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14929 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 14930 return Load; 14931 } 14932 14933 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) 14934 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14935 isa<ConstantSDNode>(N0->getOperand(1))) 14936 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), 14937 N0->getOperand(1)); 14938 14939 return SDValue(); 14940 } 14941 14942 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 14943 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 14944 /// i64 vector to have f64 elements, since the value can then be loaded 14945 /// directly into a VFP register. 14946 static bool hasNormalLoadOperand(SDNode *N) { 14947 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 14948 for (unsigned i = 0; i < NumElts; ++i) { 14949 SDNode *Elt = N->getOperand(i).getNode(); 14950 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 14951 return true; 14952 } 14953 return false; 14954 } 14955 14956 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 14957 /// ISD::BUILD_VECTOR. 14958 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 14959 TargetLowering::DAGCombinerInfo &DCI, 14960 const ARMSubtarget *Subtarget) { 14961 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 14962 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 14963 // into a pair of GPRs, which is fine when the value is used as a scalar, 14964 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 14965 SelectionDAG &DAG = DCI.DAG; 14966 if (N->getNumOperands() == 2) 14967 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 14968 return RV; 14969 14970 // Load i64 elements as f64 values so that type legalization does not split 14971 // them up into i32 values. 14972 EVT VT = N->getValueType(0); 14973 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 14974 return SDValue(); 14975 SDLoc dl(N); 14976 SmallVector<SDValue, 8> Ops; 14977 unsigned NumElts = VT.getVectorNumElements(); 14978 for (unsigned i = 0; i < NumElts; ++i) { 14979 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 14980 Ops.push_back(V); 14981 // Make the DAGCombiner fold the bitcast. 
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., one that does not
  // force the use of floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64-bit operands are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
15066 V = V.getOperand(0); 15067 else { 15068 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 15069 // Make the DAGCombiner fold the bitcasts. 15070 DCI.AddToWorklist(V.getNode()); 15071 } 15072 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 15073 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 15074 } 15075 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 15076 // Make the DAGCombiner fold the bitcasts. 15077 DCI.AddToWorklist(Vec.getNode()); 15078 return Vec; 15079 } 15080 15081 static SDValue 15082 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15083 EVT VT = N->getValueType(0); 15084 SDValue Op = N->getOperand(0); 15085 SDLoc dl(N); 15086 15087 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 15088 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 15089 // If the valuetypes are the same, we can remove the cast entirely. 15090 if (Op->getOperand(0).getValueType() == VT) 15091 return Op->getOperand(0); 15092 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 15093 } 15094 15095 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce 15096 // more VPNOT which might get folded as else predicates. 15097 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { 15098 SDValue X = 15099 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 15100 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 15101 DCI.DAG.getConstant(65535, dl, MVT::i32)); 15102 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); 15103 } 15104 15105 // Only the bottom 16 bits of the source register are used. 15106 if (Op.getValueType() == MVT::i32) { 15107 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 15108 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 15109 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 15110 return SDValue(N, 0); 15111 } 15112 return SDValue(); 15113 } 15114 15115 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, 15116 const ARMSubtarget *ST) { 15117 EVT VT = N->getValueType(0); 15118 SDValue Op = N->getOperand(0); 15119 SDLoc dl(N); 15120 15121 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST 15122 if (ST->isLittle()) 15123 return DAG.getNode(ISD::BITCAST, dl, VT, Op); 15124 15125 // VECTOR_REG_CAST undef -> undef 15126 if (Op.isUndef()) 15127 return DAG.getUNDEF(VT); 15128 15129 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) 15130 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { 15131 // If the valuetypes are the same, we can remove the cast entirely. 
15132 if (Op->getOperand(0).getValueType() == VT) 15133 return Op->getOperand(0); 15134 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); 15135 } 15136 15137 return SDValue(); 15138 } 15139 15140 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, 15141 const ARMSubtarget *Subtarget) { 15142 if (!Subtarget->hasMVEIntegerOps()) 15143 return SDValue(); 15144 15145 EVT VT = N->getValueType(0); 15146 SDValue Op0 = N->getOperand(0); 15147 SDValue Op1 = N->getOperand(1); 15148 ARMCC::CondCodes Cond = 15149 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 15150 SDLoc dl(N); 15151 15152 // vcmp X, 0, cc -> vcmpz X, cc 15153 if (isZeroVector(Op1)) 15154 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2)); 15155 15156 unsigned SwappedCond = getSwappedCondition(Cond); 15157 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 15158 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 15159 if (isZeroVector(Op0)) 15160 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 15161 DAG.getConstant(SwappedCond, dl, MVT::i32)); 15162 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 15163 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 15164 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 15165 DAG.getConstant(SwappedCond, dl, MVT::i32)); 15166 } 15167 15168 return SDValue(); 15169 } 15170 15171 /// PerformInsertEltCombine - Target-specific dag combine xforms for 15172 /// ISD::INSERT_VECTOR_ELT. 15173 static SDValue PerformInsertEltCombine(SDNode *N, 15174 TargetLowering::DAGCombinerInfo &DCI) { 15175 // Bitcast an i64 load inserted into a vector to f64. 15176 // Otherwise, the i64 value will be legalized to a pair of i32 values. 15177 EVT VT = N->getValueType(0); 15178 SDNode *Elt = N->getOperand(1).getNode(); 15179 if (VT.getVectorElementType() != MVT::i64 || 15180 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 15181 return SDValue(); 15182 15183 SelectionDAG &DAG = DCI.DAG; 15184 SDLoc dl(N); 15185 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 15186 VT.getVectorNumElements()); 15187 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 15188 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 15189 // Make the DAGCombiner fold the bitcasts. 15190 DCI.AddToWorklist(Vec.getNode()); 15191 DCI.AddToWorklist(V.getNode()); 15192 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 15193 Vec, V, N->getOperand(2)); 15194 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 15195 } 15196 15197 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either 15198 // directly or bitcast to an integer if the original is a float vector. 
// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
static SDValue
PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
    return SDValue();

  SDValue Ext = SDValue(N, 0);
  if (Ext.getOpcode() == ISD::BITCAST &&
      Ext.getOperand(0).getValueType() == MVT::f32)
    Ext = Ext.getOperand(0);
  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Ext.getOperand(1)) ||
      Ext.getConstantOperandVal(1) % 2 != 0)
    return SDValue();
  if (Ext->use_size() == 1 &&
      (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
       Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
    return SDValue();

  SDValue Op0 = Ext.getOperand(0);
  EVT VecVT = Op0.getValueType();
  unsigned ResNo = Op0.getResNo();
  unsigned Lane = Ext.getConstantOperandVal(1);
  if (VecVT.getVectorNumElements() != 4)
    return SDValue();

  // Find another extract, of Lane + 1
  auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
    return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
           isa<ConstantSDNode>(V->getOperand(1)) &&
           V->getConstantOperandVal(1) == Lane + 1 &&
           V->getOperand(0).getResNo() == ResNo;
  });
  if (OtherIt == Op0->uses().end())
    return SDValue();

  // For float extracts, we need to be converting to an i32 for both vector
  // lanes.
  SDValue OtherExt(*OtherIt, 0);
  if (OtherExt.getValueType() != MVT::i32) {
    if (OtherExt->use_size() != 1 ||
        OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
        OtherExt->use_begin()->getValueType(0) != MVT::i32)
      return SDValue();
    OtherExt = SDValue(*OtherExt->use_begin(), 0);
  }

  // Convert the type to an f64 and extract with a VMOVRRD.
15252 SDValue F64 = DCI.DAG.getNode( 15253 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 15254 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), 15255 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); 15256 SDValue VMOVRRD = 15257 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); 15258 15259 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); 15260 return VMOVRRD; 15261 } 15262 15263 static SDValue PerformExtractEltCombine(SDNode *N, 15264 TargetLowering::DAGCombinerInfo &DCI, 15265 const ARMSubtarget *ST) { 15266 SDValue Op0 = N->getOperand(0); 15267 EVT VT = N->getValueType(0); 15268 SDLoc dl(N); 15269 15270 // extract (vdup x) -> x 15271 if (Op0->getOpcode() == ARMISD::VDUP) { 15272 SDValue X = Op0->getOperand(0); 15273 if (VT == MVT::f16 && X.getValueType() == MVT::i32) 15274 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); 15275 if (VT == MVT::i32 && X.getValueType() == MVT::f16) 15276 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); 15277 if (VT == MVT::f32 && X.getValueType() == MVT::i32) 15278 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X); 15279 15280 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) 15281 X = X->getOperand(0); 15282 if (X.getValueType() == VT) 15283 return X; 15284 } 15285 15286 // extract ARM_BUILD_VECTOR -> x 15287 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR && 15288 isa<ConstantSDNode>(N->getOperand(1)) && 15289 N->getConstantOperandVal(1) < Op0.getNumOperands()) { 15290 return Op0.getOperand(N->getConstantOperandVal(1)); 15291 } 15292 15293 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b 15294 if (Op0.getValueType() == MVT::v4i32 && 15295 isa<ConstantSDNode>(N->getOperand(1)) && 15296 Op0.getOpcode() == ISD::BITCAST && 15297 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 15298 Op0.getOperand(0).getValueType() == MVT::v2f64) { 15299 SDValue BV = Op0.getOperand(0); 15300 unsigned Offset = N->getConstantOperandVal(1); 15301 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); 15302 if (MOV.getOpcode() == ARMISD::VMOVDRR) 15303 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); 15304 } 15305 15306 // extract x, n; extract x, n+1 -> VMOVRRD x 15307 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 15308 return R; 15309 15310 // extract (MVETrunc(x)) -> extract x 15311 if (Op0->getOpcode() == ARMISD::MVETRUNC) { 15312 unsigned Idx = N->getConstantOperandVal(1); 15313 unsigned Vec = 15314 Idx / Op0->getOperand(0).getValueType().getVectorNumElements(); 15315 unsigned SubIdx = 15316 Idx % Op0->getOperand(0).getValueType().getVectorNumElements(); 15317 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec), 15318 DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); 15319 } 15320 15321 return SDValue(); 15322 } 15323 15324 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { 15325 SDValue Op = N->getOperand(0); 15326 EVT VT = N->getValueType(0); 15327 15328 // sext_inreg(VGETLANEu) -> VGETLANEs 15329 if (Op.getOpcode() == ARMISD::VGETLANEu && 15330 cast<VTSDNode>(N->getOperand(1))->getVT() == 15331 Op.getOperand(0).getValueType().getScalarType()) 15332 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), 15333 Op.getOperand(1)); 15334 15335 return SDValue(); 15336 } 15337 15338 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we 15339 // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to 15340 // binop as the shuffles cancel out. 
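// For instance (illustrative): if R is the lane-reversing mask for the vector
// type, then
//   shuffle R (vqdmulh (shuffle R a), (shuffle R b))  -->  vqdmulh a, b
// since applying the same permutation to both inputs and then to the result
// of a lanewise binop is the identity.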
15341 static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { 15342 EVT VT = N->getValueType(0); 15343 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) 15344 return SDValue(); 15345 SDValue Op = N->getOperand(0); 15346 15347 // Looking for binary operators that will have been folded from 15348 // truncates/extends. 15349 switch (Op.getOpcode()) { 15350 case ARMISD::VQDMULH: 15351 case ISD::MULHS: 15352 case ISD::MULHU: 15353 case ISD::ABDS: 15354 case ISD::ABDU: 15355 break; 15356 default: 15357 return SDValue(); 15358 } 15359 15360 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); 15361 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); 15362 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || 15363 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || 15364 Op0->getOperand(0).getValueType() != VT) 15365 return SDValue(); 15366 15367 // Check the mask turns into an identity shuffle. 15368 ArrayRef<int> NMask = N->getMask(); 15369 ArrayRef<int> OpMask = Op0->getMask(); 15370 for (int i = 0, e = NMask.size(); i != e; i++) { 15371 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) 15372 return SDValue(); 15373 } 15374 15375 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), 15376 Op0->getOperand(0), Op1->getOperand(0)); 15377 } 15378 15379 static SDValue 15380 PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15381 SDValue Vec = N->getOperand(0); 15382 SDValue SubVec = N->getOperand(1); 15383 uint64_t IdxVal = N->getConstantOperandVal(2); 15384 EVT VecVT = Vec.getValueType(); 15385 EVT SubVT = SubVec.getValueType(); 15386 15387 // Only do this for legal fixed vector types. 15388 if (!VecVT.isFixedLengthVector() || 15389 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || 15390 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) 15391 return SDValue(); 15392 15393 // Ignore widening patterns. 15394 if (IdxVal == 0 && Vec.isUndef()) 15395 return SDValue(); 15396 15397 // Subvector must be half the width and an "aligned" insertion. 
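// (For example, inserting a legal v2i32 subvector into a v4i32 vector at
// index 0 or 2; illustrative types only.)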
15398 unsigned NumSubElts = SubVT.getVectorNumElements(); 15399 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || 15400 (IdxVal != 0 && IdxVal != NumSubElts)) 15401 return SDValue(); 15402 15403 // Fold insert_subvector -> concat_vectors 15404 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) 15405 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) 15406 SDLoc DL(N); 15407 SDValue Lo, Hi; 15408 if (IdxVal == 0) { 15409 Lo = SubVec; 15410 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 15411 DCI.DAG.getVectorIdxConstant(NumSubElts, DL)); 15412 } else { 15413 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 15414 DCI.DAG.getVectorIdxConstant(0, DL)); 15415 Hi = SubVec; 15416 } 15417 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); 15418 } 15419 15420 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y) 15421 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, 15422 SelectionDAG &DAG) { 15423 SDValue Trunc = N->getOperand(0); 15424 EVT VT = Trunc.getValueType(); 15425 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef()) 15426 return SDValue(); 15427 15428 SDLoc DL(Trunc); 15429 if (isVMOVNTruncMask(N->getMask(), VT, false)) 15430 return DAG.getNode( 15431 ARMISD::VMOVN, DL, VT, 15432 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 15433 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 15434 DAG.getConstant(1, DL, MVT::i32)); 15435 else if (isVMOVNTruncMask(N->getMask(), VT, true)) 15436 return DAG.getNode( 15437 ARMISD::VMOVN, DL, VT, 15438 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 15439 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 15440 DAG.getConstant(1, DL, MVT::i32)); 15441 return SDValue(); 15442 } 15443 15444 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 15445 /// ISD::VECTOR_SHUFFLE. 15446 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 15447 if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) 15448 return R; 15449 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG)) 15450 return R; 15451 15452 // The LLVM shufflevector instruction does not require the shuffle mask 15453 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 15454 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 15455 // operands do not match the mask length, they are extended by concatenating 15456 // them with undef vectors. That is probably the right thing for other 15457 // targets, but for NEON it is better to concatenate two double-register 15458 // size vector operands into a single quad-register size vector. Do that 15459 // transformation here: 15460 // shuffle(concat(v1, undef), concat(v2, undef)) -> 15461 // shuffle(concat(v1, v2), undef) 15462 SDValue Op0 = N->getOperand(0); 15463 SDValue Op1 = N->getOperand(1); 15464 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 15465 Op1.getOpcode() != ISD::CONCAT_VECTORS || 15466 Op0.getNumOperands() != 2 || 15467 Op1.getNumOperands() != 2) 15468 return SDValue(); 15469 SDValue Concat0Op1 = Op0.getOperand(1); 15470 SDValue Concat1Op1 = Op1.getOperand(1); 15471 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 15472 return SDValue(); 15473 // Skip the transformation if any of the types are illegal. 
15474 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15475 EVT VT = N->getValueType(0); 15476 if (!TLI.isTypeLegal(VT) || 15477 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 15478 !TLI.isTypeLegal(Concat1Op1.getValueType())) 15479 return SDValue(); 15480 15481 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 15482 Op0.getOperand(0), Op1.getOperand(0)); 15483 // Translate the shuffle mask. 15484 SmallVector<int, 16> NewMask; 15485 unsigned NumElts = VT.getVectorNumElements(); 15486 unsigned HalfElts = NumElts/2; 15487 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 15488 for (unsigned n = 0; n < NumElts; ++n) { 15489 int MaskElt = SVN->getMaskElt(n); 15490 int NewElt = -1; 15491 if (MaskElt < (int)HalfElts) 15492 NewElt = MaskElt; 15493 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 15494 NewElt = HalfElts + MaskElt - NumElts; 15495 NewMask.push_back(NewElt); 15496 } 15497 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 15498 DAG.getUNDEF(VT), NewMask); 15499 } 15500 15501 /// Load/store instruction that can be merged with a base address 15502 /// update 15503 struct BaseUpdateTarget { 15504 SDNode *N; 15505 bool isIntrinsic; 15506 bool isStore; 15507 unsigned AddrOpIdx; 15508 }; 15509 15510 struct BaseUpdateUser { 15511 /// Instruction that updates a pointer 15512 SDNode *N; 15513 /// Pointer increment operand 15514 SDValue Inc; 15515 /// Pointer increment value if it is a constant, or 0 otherwise 15516 unsigned ConstInc; 15517 }; 15518 15519 static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, 15520 struct BaseUpdateUser &User, 15521 bool SimpleConstIncOnly, 15522 TargetLowering::DAGCombinerInfo &DCI) { 15523 SelectionDAG &DAG = DCI.DAG; 15524 SDNode *N = Target.N; 15525 MemSDNode *MemN = cast<MemSDNode>(N); 15526 SDLoc dl(N); 15527 15528 // Find the new opcode for the updating load/store. 15529 bool isLoadOp = true; 15530 bool isLaneOp = false; 15531 // Workaround for vst1x and vld1x intrinsics which do not have alignment 15532 // as an operand. 
15533 bool hasAlignment = true; 15534 unsigned NewOpc = 0; 15535 unsigned NumVecs = 0; 15536 if (Target.isIntrinsic) { 15537 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 15538 switch (IntNo) { 15539 default: 15540 llvm_unreachable("unexpected intrinsic for Neon base update"); 15541 case Intrinsic::arm_neon_vld1: 15542 NewOpc = ARMISD::VLD1_UPD; 15543 NumVecs = 1; 15544 break; 15545 case Intrinsic::arm_neon_vld2: 15546 NewOpc = ARMISD::VLD2_UPD; 15547 NumVecs = 2; 15548 break; 15549 case Intrinsic::arm_neon_vld3: 15550 NewOpc = ARMISD::VLD3_UPD; 15551 NumVecs = 3; 15552 break; 15553 case Intrinsic::arm_neon_vld4: 15554 NewOpc = ARMISD::VLD4_UPD; 15555 NumVecs = 4; 15556 break; 15557 case Intrinsic::arm_neon_vld1x2: 15558 NewOpc = ARMISD::VLD1x2_UPD; 15559 NumVecs = 2; 15560 hasAlignment = false; 15561 break; 15562 case Intrinsic::arm_neon_vld1x3: 15563 NewOpc = ARMISD::VLD1x3_UPD; 15564 NumVecs = 3; 15565 hasAlignment = false; 15566 break; 15567 case Intrinsic::arm_neon_vld1x4: 15568 NewOpc = ARMISD::VLD1x4_UPD; 15569 NumVecs = 4; 15570 hasAlignment = false; 15571 break; 15572 case Intrinsic::arm_neon_vld2dup: 15573 NewOpc = ARMISD::VLD2DUP_UPD; 15574 NumVecs = 2; 15575 break; 15576 case Intrinsic::arm_neon_vld3dup: 15577 NewOpc = ARMISD::VLD3DUP_UPD; 15578 NumVecs = 3; 15579 break; 15580 case Intrinsic::arm_neon_vld4dup: 15581 NewOpc = ARMISD::VLD4DUP_UPD; 15582 NumVecs = 4; 15583 break; 15584 case Intrinsic::arm_neon_vld2lane: 15585 NewOpc = ARMISD::VLD2LN_UPD; 15586 NumVecs = 2; 15587 isLaneOp = true; 15588 break; 15589 case Intrinsic::arm_neon_vld3lane: 15590 NewOpc = ARMISD::VLD3LN_UPD; 15591 NumVecs = 3; 15592 isLaneOp = true; 15593 break; 15594 case Intrinsic::arm_neon_vld4lane: 15595 NewOpc = ARMISD::VLD4LN_UPD; 15596 NumVecs = 4; 15597 isLaneOp = true; 15598 break; 15599 case Intrinsic::arm_neon_vst1: 15600 NewOpc = ARMISD::VST1_UPD; 15601 NumVecs = 1; 15602 isLoadOp = false; 15603 break; 15604 case Intrinsic::arm_neon_vst2: 15605 NewOpc = ARMISD::VST2_UPD; 15606 NumVecs = 2; 15607 isLoadOp = false; 15608 break; 15609 case Intrinsic::arm_neon_vst3: 15610 NewOpc = ARMISD::VST3_UPD; 15611 NumVecs = 3; 15612 isLoadOp = false; 15613 break; 15614 case Intrinsic::arm_neon_vst4: 15615 NewOpc = ARMISD::VST4_UPD; 15616 NumVecs = 4; 15617 isLoadOp = false; 15618 break; 15619 case Intrinsic::arm_neon_vst2lane: 15620 NewOpc = ARMISD::VST2LN_UPD; 15621 NumVecs = 2; 15622 isLoadOp = false; 15623 isLaneOp = true; 15624 break; 15625 case Intrinsic::arm_neon_vst3lane: 15626 NewOpc = ARMISD::VST3LN_UPD; 15627 NumVecs = 3; 15628 isLoadOp = false; 15629 isLaneOp = true; 15630 break; 15631 case Intrinsic::arm_neon_vst4lane: 15632 NewOpc = ARMISD::VST4LN_UPD; 15633 NumVecs = 4; 15634 isLoadOp = false; 15635 isLaneOp = true; 15636 break; 15637 case Intrinsic::arm_neon_vst1x2: 15638 NewOpc = ARMISD::VST1x2_UPD; 15639 NumVecs = 2; 15640 isLoadOp = false; 15641 hasAlignment = false; 15642 break; 15643 case Intrinsic::arm_neon_vst1x3: 15644 NewOpc = ARMISD::VST1x3_UPD; 15645 NumVecs = 3; 15646 isLoadOp = false; 15647 hasAlignment = false; 15648 break; 15649 case Intrinsic::arm_neon_vst1x4: 15650 NewOpc = ARMISD::VST1x4_UPD; 15651 NumVecs = 4; 15652 isLoadOp = false; 15653 hasAlignment = false; 15654 break; 15655 } 15656 } else { 15657 isLaneOp = true; 15658 switch (N->getOpcode()) { 15659 default: 15660 llvm_unreachable("unexpected opcode for Neon base update"); 15661 case ARMISD::VLD1DUP: 15662 NewOpc = ARMISD::VLD1DUP_UPD; 15663 NumVecs = 1; 15664 break; 15665 case 
ARMISD::VLD2DUP:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case ARMISD::VLD3DUP:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case ARMISD::VLD4DUP:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case ISD::LOAD:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      break;
    case ISD::STORE:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      isLoadOp = false;
      break;
    }
  }

  // Find the size of memory referenced by the load/store.
  EVT VecTy;
  if (isLoadOp) {
    VecTy = N->getValueType(0);
  } else if (Target.isIntrinsic) {
    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
  } else {
    assert(Target.isStore &&
           "Node has to be a load, a store, or an intrinsic!");
    VecTy = N->getOperand(1).getValueType();
  }

  bool isVLDDUPOp =
      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  if (isLaneOp || isVLDDUPOp)
    NumBytes /= VecTy.getVectorNumElements();

  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
    // separate instructions that make it harder to use a non-constant update.
    return false;
  }

  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
    return false;

  // OK, we found an ADD we can fold into the base update.
  // Now, create a _UPD node, taking care of not breaking alignment.

  EVT AlignedVecTy = VecTy;
  unsigned Alignment = MemN->getAlignment();

  // If this is a less-than-standard-aligned load/store, change the type to
  // match the standard alignment.
  // The alignment is overlooked when selecting _UPD variants; and it's
  // easier to introduce bitcasts here than fix that.
  // There are 3 ways to get to this base-update combine:
  // - intrinsics: they are assumed to be properly aligned (to the standard
  //   alignment of the memory type), so we don't need to do anything.
  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  //   intrinsics, so, likewise, there's nothing to do.
  // - generic load/store instructions: the alignment is specified as an
  //   explicit operand, rather than implicitly as the standard alignment
  //   of the memory type (like the intrinsics). We need to change the
  //   memory type to match the explicit alignment. That way, we don't
  //   generate non-standard-aligned ARMISD::VLDx nodes.
  if (isa<LSBaseSDNode>(N)) {
    if (Alignment == 0)
      Alignment = 1;
    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
      assert(!isLaneOp && "Unexpected generic load/store lane.");
      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
    }
    // Don't set an explicit alignment on regular load/stores that we want
    // to transform to VLD/VST 1_UPD nodes.
    // This matches the behavior of regular load/stores, which only get an
    // explicit alignment if the MMO alignment is larger than the standard
    // alignment of the memory type.
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = 1;
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

  // Then, gather the new node's operands.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(Target.AddrOpIdx));
  Ops.push_back(User.Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size() - 2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is a non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));

  return true;
}

// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
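// For example (illustrative): with Ptr = %base,
//   (add %base, 16)                          --> 16
//   (or %base, 16) with no common bits set   --> 16 (treated as an add)
//   (add %base, %n) with a non-constant %n   --> 0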
15824 static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, 15825 SDValue Inc, const SelectionDAG &DAG) { 15826 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 15827 if (!CInc) 15828 return 0; 15829 15830 switch (Opcode) { 15831 case ARMISD::VLD1_UPD: 15832 case ISD::ADD: 15833 return CInc->getZExtValue(); 15834 case ISD::OR: { 15835 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) { 15836 // (OR ptr inc) is the same as (ADD ptr inc) 15837 return CInc->getZExtValue(); 15838 } 15839 return 0; 15840 } 15841 default: 15842 return 0; 15843 } 15844 } 15845 15846 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) { 15847 switch (N->getOpcode()) { 15848 case ISD::ADD: 15849 case ISD::OR: { 15850 if (isa<ConstantSDNode>(N->getOperand(1))) { 15851 *Ptr = N->getOperand(0); 15852 *CInc = N->getOperand(1); 15853 return true; 15854 } 15855 return false; 15856 } 15857 case ARMISD::VLD1_UPD: { 15858 if (isa<ConstantSDNode>(N->getOperand(2))) { 15859 *Ptr = N->getOperand(1); 15860 *CInc = N->getOperand(2); 15861 return true; 15862 } 15863 return false; 15864 } 15865 default: 15866 return false; 15867 } 15868 } 15869 15870 static bool isValidBaseUpdate(SDNode *N, SDNode *User) { 15871 // Check that the add is independent of the load/store. 15872 // Otherwise, folding it would create a cycle. Search through Addr 15873 // as well, since the User may not be a direct user of Addr and 15874 // only share a base pointer. 15875 SmallPtrSet<const SDNode *, 32> Visited; 15876 SmallVector<const SDNode *, 16> Worklist; 15877 Worklist.push_back(N); 15878 Worklist.push_back(User); 15879 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 15880 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15881 return false; 15882 return true; 15883 } 15884 15885 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 15886 /// NEON load/store intrinsics, and generic vector load/stores, to merge 15887 /// base address updates. 15888 /// For generic load/stores, the memory type is assumed to be a vector. 15889 /// The caller is assumed to have checked legality. 15890 static SDValue CombineBaseUpdate(SDNode *N, 15891 TargetLowering::DAGCombinerInfo &DCI) { 15892 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 15893 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 15894 const bool isStore = N->getOpcode() == ISD::STORE; 15895 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 15896 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx}; 15897 15898 SDValue Addr = N->getOperand(AddrOpIdx); 15899 15900 SmallVector<BaseUpdateUser, 8> BaseUpdates; 15901 15902 // Search for a use of the address operand that is an increment. 15903 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 15904 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 15905 SDNode *User = *UI; 15906 if (UI.getUse().getResNo() != Addr.getResNo() || 15907 User->getNumOperands() != 2) 15908 continue; 15909 15910 SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 
0 : 1); 15911 unsigned ConstInc = 15912 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); 15913 15914 if (ConstInc || User->getOpcode() == ISD::ADD) 15915 BaseUpdates.push_back({User, Inc, ConstInc}); 15916 } 15917 15918 // If the address is a constant pointer increment itself, find 15919 // another constant increment that has the same base operand 15920 SDValue Base; 15921 SDValue CInc; 15922 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) { 15923 unsigned Offset = 15924 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG); 15925 for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end(); 15926 UI != UE; ++UI) { 15927 15928 SDNode *User = *UI; 15929 if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() || 15930 User->getNumOperands() != 2) 15931 continue; 15932 15933 SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0); 15934 unsigned UserOffset = 15935 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG); 15936 15937 if (!UserOffset || UserOffset <= Offset) 15938 continue; 15939 15940 unsigned NewConstInc = UserOffset - Offset; 15941 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); 15942 BaseUpdates.push_back({User, NewInc, NewConstInc}); 15943 } 15944 } 15945 15946 // Try to fold the load/store with an update that matches memory 15947 // access size. This should work well for sequential loads. 15948 // 15949 // Filter out invalid updates as well. 15950 unsigned NumValidUpd = BaseUpdates.size(); 15951 for (unsigned I = 0; I < NumValidUpd;) { 15952 BaseUpdateUser &User = BaseUpdates[I]; 15953 if (!isValidBaseUpdate(N, User.N)) { 15954 --NumValidUpd; 15955 std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); 15956 continue; 15957 } 15958 15959 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) 15960 return SDValue(); 15961 ++I; 15962 } 15963 BaseUpdates.resize(NumValidUpd); 15964 15965 // Try to fold with other users. Non-constant updates are considered 15966 // first, and constant updates are sorted to not break a sequence of 15967 // strided accesses (if there is any). 15968 std::sort(BaseUpdates.begin(), BaseUpdates.end(), 15969 [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) { 15970 return LHS.ConstInc < RHS.ConstInc; 15971 }); 15972 for (BaseUpdateUser &User : BaseUpdates) { 15973 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) 15974 return SDValue(); 15975 } 15976 return SDValue(); 15977 } 15978 15979 static SDValue PerformVLDCombine(SDNode *N, 15980 TargetLowering::DAGCombinerInfo &DCI) { 15981 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 15982 return SDValue(); 15983 15984 return CombineBaseUpdate(N, DCI); 15985 } 15986 15987 static SDValue PerformMVEVLDCombine(SDNode *N, 15988 TargetLowering::DAGCombinerInfo &DCI) { 15989 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 15990 return SDValue(); 15991 15992 SelectionDAG &DAG = DCI.DAG; 15993 SDValue Addr = N->getOperand(2); 15994 MemSDNode *MemN = cast<MemSDNode>(N); 15995 SDLoc dl(N); 15996 15997 // For the stores, where there are multiple intrinsics we only actually want 15998 // to post-inc the last of the them. 
15999 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 16000 if (IntNo == Intrinsic::arm_mve_vst2q && 16001 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) 16002 return SDValue(); 16003 if (IntNo == Intrinsic::arm_mve_vst4q && 16004 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) 16005 return SDValue(); 16006 16007 // Search for a use of the address operand that is an increment. 16008 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 16009 UE = Addr.getNode()->use_end(); 16010 UI != UE; ++UI) { 16011 SDNode *User = *UI; 16012 if (User->getOpcode() != ISD::ADD || 16013 UI.getUse().getResNo() != Addr.getResNo()) 16014 continue; 16015 16016 // Check that the add is independent of the load/store. Otherwise, folding 16017 // it would create a cycle. We can avoid searching through Addr as it's a 16018 // predecessor to both. 16019 SmallPtrSet<const SDNode *, 32> Visited; 16020 SmallVector<const SDNode *, 16> Worklist; 16021 Visited.insert(Addr.getNode()); 16022 Worklist.push_back(N); 16023 Worklist.push_back(User); 16024 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 16025 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 16026 continue; 16027 16028 // Find the new opcode for the updating load/store. 16029 bool isLoadOp = true; 16030 unsigned NewOpc = 0; 16031 unsigned NumVecs = 0; 16032 switch (IntNo) { 16033 default: 16034 llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); 16035 case Intrinsic::arm_mve_vld2q: 16036 NewOpc = ARMISD::VLD2_UPD; 16037 NumVecs = 2; 16038 break; 16039 case Intrinsic::arm_mve_vld4q: 16040 NewOpc = ARMISD::VLD4_UPD; 16041 NumVecs = 4; 16042 break; 16043 case Intrinsic::arm_mve_vst2q: 16044 NewOpc = ARMISD::VST2_UPD; 16045 NumVecs = 2; 16046 isLoadOp = false; 16047 break; 16048 case Intrinsic::arm_mve_vst4q: 16049 NewOpc = ARMISD::VST4_UPD; 16050 NumVecs = 4; 16051 isLoadOp = false; 16052 break; 16053 } 16054 16055 // Find the size of memory referenced by the load/store. 16056 EVT VecTy; 16057 if (isLoadOp) { 16058 VecTy = N->getValueType(0); 16059 } else { 16060 VecTy = N->getOperand(3).getValueType(); 16061 } 16062 16063 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 16064 16065 // If the increment is a constant, it must match the memory ref size. 16066 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 16067 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 16068 if (!CInc || CInc->getZExtValue() != NumBytes) 16069 continue; 16070 16071 // Create the new updating load/store node. 16072 // First, create an SDVTList for the new updating node's results. 16073 EVT Tys[6]; 16074 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 16075 unsigned n; 16076 for (n = 0; n < NumResultVecs; ++n) 16077 Tys[n] = VecTy; 16078 Tys[n++] = MVT::i32; 16079 Tys[n] = MVT::Other; 16080 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 16081 16082 // Then, gather the new node's operands. 16083 SmallVector<SDValue, 8> Ops; 16084 Ops.push_back(N->getOperand(0)); // incoming chain 16085 Ops.push_back(N->getOperand(2)); // ptr 16086 Ops.push_back(Inc); 16087 16088 for (unsigned i = 3; i < N->getNumOperands(); ++i) 16089 Ops.push_back(N->getOperand(i)); 16090 16091 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, 16092 MemN->getMemOperand()); 16093 16094 // Update the uses. 
16095 SmallVector<SDValue, 5> NewResults; 16096 for (unsigned i = 0; i < NumResultVecs; ++i) 16097 NewResults.push_back(SDValue(UpdN.getNode(), i)); 16098 16099 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain 16100 DCI.CombineTo(N, NewResults); 16101 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 16102 16103 break; 16104 } 16105 16106 return SDValue(); 16107 } 16108 16109 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 16110 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 16111 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 16112 /// return true. 16113 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 16114 SelectionDAG &DAG = DCI.DAG; 16115 EVT VT = N->getValueType(0); 16116 // vldN-dup instructions only support 64-bit vectors for N > 1. 16117 if (!VT.is64BitVector()) 16118 return false; 16119 16120 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 16121 SDNode *VLD = N->getOperand(0).getNode(); 16122 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 16123 return false; 16124 unsigned NumVecs = 0; 16125 unsigned NewOpc = 0; 16126 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 16127 if (IntNo == Intrinsic::arm_neon_vld2lane) { 16128 NumVecs = 2; 16129 NewOpc = ARMISD::VLD2DUP; 16130 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 16131 NumVecs = 3; 16132 NewOpc = ARMISD::VLD3DUP; 16133 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 16134 NumVecs = 4; 16135 NewOpc = ARMISD::VLD4DUP; 16136 } else { 16137 return false; 16138 } 16139 16140 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 16141 // numbers match the load. 16142 unsigned VLDLaneNo = 16143 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 16144 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 16145 UI != UE; ++UI) { 16146 // Ignore uses of the chain result. 16147 if (UI.getUse().getResNo() == NumVecs) 16148 continue; 16149 SDNode *User = *UI; 16150 if (User->getOpcode() != ARMISD::VDUPLANE || 16151 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 16152 return false; 16153 } 16154 16155 // Create the vldN-dup node. 16156 EVT Tys[5]; 16157 unsigned n; 16158 for (n = 0; n < NumVecs; ++n) 16159 Tys[n] = VT; 16160 Tys[n] = MVT::Other; 16161 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 16162 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 16163 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 16164 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 16165 Ops, VLDMemInt->getMemoryVT(), 16166 VLDMemInt->getMemOperand()); 16167 16168 // Update the uses. 16169 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 16170 UI != UE; ++UI) { 16171 unsigned ResNo = UI.getUse().getResNo(); 16172 // Ignore uses of the chain result. 16173 if (ResNo == NumVecs) 16174 continue; 16175 SDNode *User = *UI; 16176 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 16177 } 16178 16179 // Now the vldN-lane intrinsic is dead except for its chain result. 16180 // Update uses of the chain. 
16181 std::vector<SDValue> VLDDupResults; 16182 for (unsigned n = 0; n < NumVecs; ++n) 16183 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 16184 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 16185 DCI.CombineTo(VLD, VLDDupResults); 16186 16187 return true; 16188 } 16189 16190 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 16191 /// ARMISD::VDUPLANE. 16192 static SDValue PerformVDUPLANECombine(SDNode *N, 16193 TargetLowering::DAGCombinerInfo &DCI, 16194 const ARMSubtarget *Subtarget) { 16195 SDValue Op = N->getOperand(0); 16196 EVT VT = N->getValueType(0); 16197 16198 // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 16199 if (Subtarget->hasMVEIntegerOps()) { 16200 EVT ExtractVT = VT.getVectorElementType(); 16201 // We need to ensure we are creating a legal type. 16202 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) 16203 ExtractVT = MVT::i32; 16204 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, 16205 N->getOperand(0), N->getOperand(1)); 16206 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); 16207 } 16208 16209 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 16210 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 16211 if (CombineVLDDUP(N, DCI)) 16212 return SDValue(N, 0); 16213 16214 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 16215 // redundant. Ignore bit_converts for now; element sizes are checked below. 16216 while (Op.getOpcode() == ISD::BITCAST) 16217 Op = Op.getOperand(0); 16218 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 16219 return SDValue(); 16220 16221 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 16222 unsigned EltSize = Op.getScalarValueSizeInBits(); 16223 // The canonical VMOV for a zero vector uses a 32-bit element size. 16224 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 16225 unsigned EltBits; 16226 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 16227 EltSize = 8; 16228 if (EltSize > VT.getScalarSizeInBits()) 16229 return SDValue(); 16230 16231 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 16232 } 16233 16234 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 16235 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, 16236 const ARMSubtarget *Subtarget) { 16237 SDValue Op = N->getOperand(0); 16238 SDLoc dl(N); 16239 16240 if (Subtarget->hasMVEIntegerOps()) { 16241 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will 16242 // need to come from a GPR. 16243 if (Op.getValueType() == MVT::f32) 16244 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 16245 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); 16246 else if (Op.getValueType() == MVT::f16) 16247 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 16248 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); 16249 } 16250 16251 if (!Subtarget->hasNEON()) 16252 return SDValue(); 16253 16254 // Match VDUP(LOAD) -> VLD1DUP. 16255 // We match this pattern here rather than waiting for isel because the 16256 // transform is only legal for unindexed loads. 
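  // Illustrative sketch (example only): a scalar load feeding a vdup, e.g.
  //   (ARMISD::VDUP (load i16, ptr))
  // can instead be selected as a single load-and-broadcast
  //   vld1.16 {d0[]}, [ptr]
  // via ARMISD::VLD1DUP, provided the load is unindexed, the loaded value has
  // no other users, and the memory type matches the vector element type
  // (all checked below).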
16257 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 16258 if (LD && Op.hasOneUse() && LD->isUnindexed() && 16259 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 16260 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), 16261 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)}; 16262 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 16263 SDValue VLDDup = 16264 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, 16265 LD->getMemoryVT(), LD->getMemOperand()); 16266 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 16267 return VLDDup; 16268 } 16269 16270 return SDValue(); 16271 } 16272 16273 static SDValue PerformLOADCombine(SDNode *N, 16274 TargetLowering::DAGCombinerInfo &DCI, 16275 const ARMSubtarget *Subtarget) { 16276 EVT VT = N->getValueType(0); 16277 16278 // If this is a legal vector load, try to combine it into a VLD1_UPD. 16279 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() && 16280 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16281 return CombineBaseUpdate(N, DCI); 16282 16283 return SDValue(); 16284 } 16285 16286 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 16287 // pack all of the elements in one place. Next, store to memory in fewer 16288 // chunks. 16289 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 16290 SelectionDAG &DAG) { 16291 SDValue StVal = St->getValue(); 16292 EVT VT = StVal.getValueType(); 16293 if (!St->isTruncatingStore() || !VT.isVector()) 16294 return SDValue(); 16295 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16296 EVT StVT = St->getMemoryVT(); 16297 unsigned NumElems = VT.getVectorNumElements(); 16298 assert(StVT != VT && "Cannot truncate to the same type"); 16299 unsigned FromEltSz = VT.getScalarSizeInBits(); 16300 unsigned ToEltSz = StVT.getScalarSizeInBits(); 16301 16302 // From, To sizes and ElemCount must be pow of two 16303 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 16304 return SDValue(); 16305 16306 // We are going to use the original vector elt for storing. 16307 // Accumulated smaller vector elements must be a multiple of the store size. 16308 if (0 != (NumElems * FromEltSz) % ToEltSz) 16309 return SDValue(); 16310 16311 unsigned SizeRatio = FromEltSz / ToEltSz; 16312 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 16313 16314 // Create a type on which we perform the shuffle. 16315 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 16316 NumElems * SizeRatio); 16317 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 16318 16319 SDLoc DL(St); 16320 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 16321 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 16322 for (unsigned i = 0; i < NumElems; ++i) 16323 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 16324 : i * SizeRatio; 16325 16326 // Can't shuffle using an illegal type. 16327 if (!TLI.isTypeLegal(WideVecVT)) 16328 return SDValue(); 16329 16330 SDValue Shuff = DAG.getVectorShuffle( 16331 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 16332 // At this point all of the data is stored at the bottom of the 16333 // register. We now need to save it to mem. 
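  // Worked example (illustrative values): truncating a v4i32 value down to a
  // v4i8 store gives SizeRatio == 32 / 8 == 4 and WideVecVT == v16i8. On a
  // little-endian target the mask above selects bytes 0, 4, 8 and 12, so the
  // four narrowed elements now sit contiguously at the bottom of the register
  // and the loop below can write them out with a single i32 store.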
16334 16335 // Find the largest store unit 16336 MVT StoreType = MVT::i8; 16337 for (MVT Tp : MVT::integer_valuetypes()) { 16338 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 16339 StoreType = Tp; 16340 } 16341 // Didn't find a legal store type. 16342 if (!TLI.isTypeLegal(StoreType)) 16343 return SDValue(); 16344 16345 // Bitcast the original vector into a vector of store-size units 16346 EVT StoreVecVT = 16347 EVT::getVectorVT(*DAG.getContext(), StoreType, 16348 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 16349 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 16350 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 16351 SmallVector<SDValue, 8> Chains; 16352 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 16353 TLI.getPointerTy(DAG.getDataLayout())); 16354 SDValue BasePtr = St->getBasePtr(); 16355 16356 // Perform one or more big stores into memory. 16357 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); 16358 for (unsigned I = 0; I < E; I++) { 16359 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, 16360 ShuffWide, DAG.getIntPtrConstant(I, DL)); 16361 SDValue Ch = 16362 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), 16363 St->getAlignment(), St->getMemOperand()->getFlags()); 16364 BasePtr = 16365 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); 16366 Chains.push_back(Ch); 16367 } 16368 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 16369 } 16370 16371 // Try taking a single vector store from an fpround (which would otherwise turn 16372 // into an expensive buildvector) and splitting it into a series of narrowing 16373 // stores. 16374 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 16375 SelectionDAG &DAG) { 16376 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16377 return SDValue(); 16378 SDValue Trunc = St->getValue(); 16379 if (Trunc->getOpcode() != ISD::FP_ROUND) 16380 return SDValue(); 16381 EVT FromVT = Trunc->getOperand(0).getValueType(); 16382 EVT ToVT = Trunc.getValueType(); 16383 if (!ToVT.isVector()) 16384 return SDValue(); 16385 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 16386 EVT ToEltVT = ToVT.getVectorElementType(); 16387 EVT FromEltVT = FromVT.getVectorElementType(); 16388 16389 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16) 16390 return SDValue(); 16391 16392 unsigned NumElements = 4; 16393 if (FromVT.getVectorNumElements() % NumElements != 0) 16394 return SDValue(); 16395 16396 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so 16397 // use the VMOVN over splitting the store. We are looking for patterns of: 16398 // !rev: 0 N 1 N+1 2 N+2 ... 16399 // rev: N 0 N+1 1 N+2 2 ... 16400 // The shuffle may either be a single source (in which case N = NumElts/2) or 16401 // two inputs extended with concat to the same size (in which case N = 16402 // NumElts). 16403 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { 16404 ArrayRef<int> M = SVN->getMask(); 16405 unsigned NumElts = ToVT.getVectorNumElements(); 16406 if (SVN->getOperand(1).isUndef()) 16407 NumElts /= 2; 16408 16409 unsigned Off0 = Rev ? NumElts : 0; 16410 unsigned Off1 = Rev ? 
0 : NumElts; 16411 16412 for (unsigned I = 0; I < NumElts; I += 2) { 16413 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) 16414 return false; 16415 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) 16416 return false; 16417 } 16418 16419 return true; 16420 }; 16421 16422 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) 16423 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) 16424 return SDValue(); 16425 16426 LLVMContext &C = *DAG.getContext(); 16427 SDLoc DL(St); 16428 // Details about the old store 16429 SDValue Ch = St->getChain(); 16430 SDValue BasePtr = St->getBasePtr(); 16431 Align Alignment = St->getOriginalAlign(); 16432 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16433 AAMDNodes AAInfo = St->getAAInfo(); 16434 16435 // We split the store into slices of NumElements. fp16 trunc stores are vcvt 16436 // and then stored as truncating integer stores. 16437 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); 16438 EVT NewToVT = EVT::getVectorVT( 16439 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); 16440 16441 SmallVector<SDValue, 4> Stores; 16442 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 16443 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 16444 SDValue NewPtr = 16445 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16446 16447 SDValue Extract = 16448 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 16449 DAG.getConstant(i * NumElements, DL, MVT::i32)); 16450 16451 SDValue FPTrunc = 16452 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), 16453 Extract, DAG.getConstant(0, DL, MVT::i32)); 16454 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); 16455 16456 SDValue Store = DAG.getTruncStore( 16457 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 16458 NewToVT, Alignment.value(), MMOFlags, AAInfo); 16459 Stores.push_back(Store); 16460 } 16461 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 16462 } 16463 16464 // Try taking a single vector store from an MVETRUNC (which would otherwise turn 16465 // into an expensive buildvector) and splitting it into a series of narrowing 16466 // stores. 
16467 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, 16468 SelectionDAG &DAG) { 16469 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16470 return SDValue(); 16471 SDValue Trunc = St->getValue(); 16472 if (Trunc->getOpcode() != ARMISD::MVETRUNC) 16473 return SDValue(); 16474 EVT FromVT = Trunc->getOperand(0).getValueType(); 16475 EVT ToVT = Trunc.getValueType(); 16476 16477 LLVMContext &C = *DAG.getContext(); 16478 SDLoc DL(St); 16479 // Details about the old store 16480 SDValue Ch = St->getChain(); 16481 SDValue BasePtr = St->getBasePtr(); 16482 Align Alignment = St->getOriginalAlign(); 16483 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16484 AAMDNodes AAInfo = St->getAAInfo(); 16485 16486 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(), 16487 FromVT.getVectorNumElements()); 16488 16489 SmallVector<SDValue, 4> Stores; 16490 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) { 16491 unsigned NewOffset = 16492 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8; 16493 SDValue NewPtr = 16494 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16495 16496 SDValue Extract = Trunc.getOperand(i); 16497 SDValue Store = DAG.getTruncStore( 16498 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 16499 NewToVT, Alignment.value(), MMOFlags, AAInfo); 16500 Stores.push_back(Store); 16501 } 16502 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 16503 } 16504 16505 // Given a floating point store from an extracted vector, with an integer 16506 // VGETLANE that already exists, store the existing VGETLANEu directly. This can 16507 // help reduce fp register pressure, doesn't require the fp extract and allows 16508 // use of more integer post-inc stores not available with vstr. 16509 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { 16510 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16511 return SDValue(); 16512 SDValue Extract = St->getValue(); 16513 EVT VT = Extract.getValueType(); 16514 // For now only uses f16. This may be useful for f32 too, but that will 16515 // be bitcast(extract), not the VGETLANEu we currently check here. 16516 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 16517 return SDValue(); 16518 16519 SDNode *GetLane = 16520 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), 16521 {Extract.getOperand(0), Extract.getOperand(1)}); 16522 if (!GetLane) 16523 return SDValue(); 16524 16525 LLVMContext &C = *DAG.getContext(); 16526 SDLoc DL(St); 16527 // Create a new integer store to replace the existing floating point version. 16528 SDValue Ch = St->getChain(); 16529 SDValue BasePtr = St->getBasePtr(); 16530 Align Alignment = St->getOriginalAlign(); 16531 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16532 AAMDNodes AAInfo = St->getAAInfo(); 16533 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); 16534 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, 16535 St->getPointerInfo(), NewToVT, 16536 Alignment.value(), MMOFlags, AAInfo); 16537 16538 return Store; 16539 } 16540 16541 /// PerformSTORECombine - Target-specific dag combine xforms for 16542 /// ISD::STORE. 
16543 static SDValue PerformSTORECombine(SDNode *N, 16544 TargetLowering::DAGCombinerInfo &DCI, 16545 const ARMSubtarget *Subtarget) { 16546 StoreSDNode *St = cast<StoreSDNode>(N); 16547 if (St->isVolatile()) 16548 return SDValue(); 16549 SDValue StVal = St->getValue(); 16550 EVT VT = StVal.getValueType(); 16551 16552 if (Subtarget->hasNEON()) 16553 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 16554 return Store; 16555 16556 if (Subtarget->hasMVEIntegerOps()) { 16557 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 16558 return NewToken; 16559 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) 16560 return NewChain; 16561 if (SDValue NewToken = 16562 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG)) 16563 return NewToken; 16564 } 16565 16566 if (!ISD::isNormalStore(St)) 16567 return SDValue(); 16568 16569 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 16570 // ARM stores of arguments in the same cache line. 16571 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 16572 StVal.getNode()->hasOneUse()) { 16573 SelectionDAG &DAG = DCI.DAG; 16574 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 16575 SDLoc DL(St); 16576 SDValue BasePtr = St->getBasePtr(); 16577 SDValue NewST1 = DAG.getStore( 16578 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 16579 BasePtr, St->getPointerInfo(), St->getOriginalAlign(), 16580 St->getMemOperand()->getFlags()); 16581 16582 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 16583 DAG.getConstant(4, DL, MVT::i32)); 16584 return DAG.getStore(NewST1.getValue(0), DL, 16585 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 16586 OffsetPtr, St->getPointerInfo().getWithOffset(4), 16587 St->getOriginalAlign(), 16588 St->getMemOperand()->getFlags()); 16589 } 16590 16591 if (StVal.getValueType() == MVT::i64 && 16592 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 16593 16594 // Bitcast an i64 store extracted from a vector to f64. 16595 // Otherwise, the i64 value will be legalized to a pair of i32 values. 16596 SelectionDAG &DAG = DCI.DAG; 16597 SDLoc dl(StVal); 16598 SDValue IntVec = StVal.getOperand(0); 16599 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 16600 IntVec.getValueType().getVectorNumElements()); 16601 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 16602 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 16603 Vec, StVal.getOperand(1)); 16604 dl = SDLoc(N); 16605 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 16606 // Make the DAGCombiner fold the bitcasts. 16607 DCI.AddToWorklist(Vec.getNode()); 16608 DCI.AddToWorklist(ExtElt.getNode()); 16609 DCI.AddToWorklist(V.getNode()); 16610 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 16611 St->getPointerInfo(), St->getAlignment(), 16612 St->getMemOperand()->getFlags(), St->getAAInfo()); 16613 } 16614 16615 // If this is a legal vector store, try to combine it into a VST1_UPD. 16616 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 16617 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16618 return CombineBaseUpdate(N, DCI); 16619 16620 return SDValue(); 16621 } 16622 16623 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 16624 /// can replace combinations of VMUL and VCVT (floating-point to integer) 16625 /// when the VMUL has a constant operand that is a power of 2. 
16626 ///
16627 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16628 ///  vmul.f32 d16, d17, d16
16629 ///  vcvt.s32.f32 d16, d16
16630 /// becomes:
16631 ///  vcvt.s32.f32 d16, d16, #3
16632 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16633                                   const ARMSubtarget *Subtarget) {
16634   if (!Subtarget->hasNEON())
16635     return SDValue();
16636 
16637   SDValue Op = N->getOperand(0);
16638   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16639       Op.getOpcode() != ISD::FMUL)
16640     return SDValue();
16641 
16642   SDValue ConstVec = Op->getOperand(1);
16643   if (!isa<BuildVectorSDNode>(ConstVec))
16644     return SDValue();
16645 
16646   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16647   uint32_t FloatBits = FloatTy.getSizeInBits();
16648   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16649   uint32_t IntBits = IntTy.getSizeInBits();
16650   unsigned NumLanes = Op.getValueType().getVectorNumElements();
16651   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16652     // These instructions only exist converting from f32 to i32. We can handle
16653     // smaller integers by generating an extra truncate, but larger ones would
16654     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16655     // these instructions only support v2i32/v4i32 types.
16656     return SDValue();
16657   }
16658 
16659   BitVector UndefElements;
16660   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16661   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16662   if (C == -1 || C == 0 || C > 32)
16663     return SDValue();
16664 
16665   SDLoc dl(N);
16666   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16667   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16668     Intrinsic::arm_neon_vcvtfp2fxu;
16669   SDValue FixConv = DAG.getNode(
16670       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16671       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16672       DAG.getConstant(C, dl, MVT::i32));
16673 
16674   if (IntBits < FloatBits)
16675     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16676 
16677   return FixConv;
16678 }
16679 
16680 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16681                                          const ARMSubtarget *Subtarget) {
16682   if (!Subtarget->hasMVEFloatOps())
16683     return SDValue();
16684 
16685   // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16686   // The second form can be more easily turned into a predicated vadd, and
16687   // possibly combined into a fma to become a predicated vfma.
16688   SDValue Op0 = N->getOperand(0);
16689   SDValue Op1 = N->getOperand(1);
16690   EVT VT = N->getValueType(0);
16691   SDLoc DL(N);
16692 
16693   // The identity element for a fadd is -0.0, which these VMOVs represent.
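  // Note (illustrative reasoning): -0.0 rather than +0.0 is the correct
  // identity because x + (-0.0) == x for every x, including x == +0.0,
  // whereas adding +0.0 would turn a -0.0 lane into +0.0. The immediates
  // matched below are assumed to be the VMOV modified-immediate encodings of
  // a -0.0 splat for v4f32 and v8f16 respectively.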
16694 auto isNegativeZeroSplat = [&](SDValue Op) { 16695 if (Op.getOpcode() != ISD::BITCAST || 16696 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) 16697 return false; 16698 if (VT == MVT::v4f32 && Op.getOperand(0).getConstantOperandVal(0) == 1664) 16699 return true; 16700 if (VT == MVT::v8f16 && Op.getOperand(0).getConstantOperandVal(0) == 2688) 16701 return true; 16702 return false; 16703 }; 16704 16705 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) 16706 std::swap(Op0, Op1); 16707 16708 if (Op1.getOpcode() != ISD::VSELECT || 16709 !isNegativeZeroSplat(Op1.getOperand(2))) 16710 return SDValue(); 16711 SDValue FAdd = 16712 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), N->getFlags()); 16713 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0); 16714 } 16715 16716 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 16717 /// can replace combinations of VCVT (integer to floating-point) and VDIV 16718 /// when the VDIV has a constant operand that is a power of 2. 16719 /// 16720 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 16721 /// vcvt.f32.s32 d16, d16 16722 /// vdiv.f32 d16, d17, d16 16723 /// becomes: 16724 /// vcvt.f32.s32 d16, d16, #3 16725 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 16726 const ARMSubtarget *Subtarget) { 16727 if (!Subtarget->hasNEON()) 16728 return SDValue(); 16729 16730 SDValue Op = N->getOperand(0); 16731 unsigned OpOpcode = Op.getNode()->getOpcode(); 16732 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 16733 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 16734 return SDValue(); 16735 16736 SDValue ConstVec = N->getOperand(1); 16737 if (!isa<BuildVectorSDNode>(ConstVec)) 16738 return SDValue(); 16739 16740 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 16741 uint32_t FloatBits = FloatTy.getSizeInBits(); 16742 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 16743 uint32_t IntBits = IntTy.getSizeInBits(); 16744 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 16745 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 16746 // These instructions only exist converting from i32 to f32. We can handle 16747 // smaller integers by generating an extra extend, but larger ones would 16748 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 16749 // these intructions only support v2i32/v4i32 types. 16750 return SDValue(); 16751 } 16752 16753 BitVector UndefElements; 16754 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 16755 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 16756 if (C == -1 || C == 0 || C > 32) 16757 return SDValue(); 16758 16759 SDLoc dl(N); 16760 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 16761 SDValue ConvInput = Op.getOperand(0); 16762 if (IntBits < FloatBits) 16763 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 16764 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 16765 ConvInput); 16766 16767 unsigned IntrinsicOpcode = isSigned ? 
Intrinsic::arm_neon_vcvtfxs2fp : 16768 Intrinsic::arm_neon_vcvtfxu2fp; 16769 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 16770 Op.getValueType(), 16771 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 16772 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 16773 } 16774 16775 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 16776 const ARMSubtarget *ST) { 16777 if (!ST->hasMVEIntegerOps()) 16778 return SDValue(); 16779 16780 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 16781 EVT ResVT = N->getValueType(0); 16782 SDValue N0 = N->getOperand(0); 16783 SDLoc dl(N); 16784 16785 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y) 16786 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD && 16787 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 || 16788 N0.getValueType() == MVT::v16i8)) { 16789 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0)); 16790 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1)); 16791 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1); 16792 } 16793 16794 // We are looking for something that will have illegal types if left alone, 16795 // but that we can convert to a single instruction under MVE. For example 16796 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 16797 // or 16798 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 16799 16800 // The legal cases are: 16801 // VADDV u/s 8/16/32 16802 // VMLAV u/s 8/16/32 16803 // VADDLV u/s 32 16804 // VMLALV u/s 16/32 16805 16806 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can 16807 // extend it and use v4i32 instead. 16808 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) { 16809 EVT AVT = A.getValueType(); 16810 return any_of(ExtTypes, [&](MVT Ty) { 16811 return AVT.getVectorNumElements() == Ty.getVectorNumElements() && 16812 AVT.bitsLE(Ty); 16813 }); 16814 }; 16815 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { 16816 EVT AVT = A.getValueType(); 16817 if (!AVT.is128BitVector()) 16818 A = DAG.getNode(ExtendCode, dl, 16819 AVT.changeVectorElementType(MVT::getIntegerVT( 16820 128 / AVT.getVectorMinNumElements())), 16821 A); 16822 return A; 16823 }; 16824 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 16825 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 16826 return SDValue(); 16827 SDValue A = N0->getOperand(0); 16828 if (ExtTypeMatches(A, ExtTypes)) 16829 return ExtendIfNeeded(A, ExtendCode); 16830 return SDValue(); 16831 }; 16832 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, 16833 ArrayRef<MVT> ExtTypes, SDValue &Mask) { 16834 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16835 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16836 return SDValue(); 16837 Mask = N0->getOperand(0); 16838 SDValue Ext = N0->getOperand(1); 16839 if (Ext->getOpcode() != ExtendCode) 16840 return SDValue(); 16841 SDValue A = Ext->getOperand(0); 16842 if (ExtTypeMatches(A, ExtTypes)) 16843 return ExtendIfNeeded(A, ExtendCode); 16844 return SDValue(); 16845 }; 16846 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16847 SDValue &A, SDValue &B) { 16848 // For a vmla we are trying to match a larger pattern: 16849 // ExtA = sext/zext A 16850 // ExtB = sext/zext B 16851 // Mul = mul ExtA, ExtB 16852 // vecreduce.add Mul 16853 // There might also be en extra extend between the mul and the addreduce, so 16854 // long as the bitwidth is high enough to make them equivalent (for 
example 16855 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 16856 if (ResVT != RetTy) 16857 return false; 16858 SDValue Mul = N0; 16859 if (Mul->getOpcode() == ExtendCode && 16860 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16861 ResVT.getScalarSizeInBits()) 16862 Mul = Mul->getOperand(0); 16863 if (Mul->getOpcode() != ISD::MUL) 16864 return false; 16865 SDValue ExtA = Mul->getOperand(0); 16866 SDValue ExtB = Mul->getOperand(1); 16867 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16868 return false; 16869 A = ExtA->getOperand(0); 16870 B = ExtB->getOperand(0); 16871 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16872 A = ExtendIfNeeded(A, ExtendCode); 16873 B = ExtendIfNeeded(B, ExtendCode); 16874 return true; 16875 } 16876 return false; 16877 }; 16878 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16879 SDValue &A, SDValue &B, SDValue &Mask) { 16880 // Same as the pattern above with a select for the zero predicated lanes 16881 // ExtA = sext/zext A 16882 // ExtB = sext/zext B 16883 // Mul = mul ExtA, ExtB 16884 // N0 = select Mask, Mul, 0 16885 // vecreduce.add N0 16886 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16887 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16888 return false; 16889 Mask = N0->getOperand(0); 16890 SDValue Mul = N0->getOperand(1); 16891 if (Mul->getOpcode() == ExtendCode && 16892 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16893 ResVT.getScalarSizeInBits()) 16894 Mul = Mul->getOperand(0); 16895 if (Mul->getOpcode() != ISD::MUL) 16896 return false; 16897 SDValue ExtA = Mul->getOperand(0); 16898 SDValue ExtB = Mul->getOperand(1); 16899 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16900 return false; 16901 A = ExtA->getOperand(0); 16902 B = ExtB->getOperand(0); 16903 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16904 A = ExtendIfNeeded(A, ExtendCode); 16905 B = ExtendIfNeeded(B, ExtendCode); 16906 return true; 16907 } 16908 return false; 16909 }; 16910 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 16911 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64 16912 // reductions. The operands are extended with MVEEXT, but as they are 16913 // reductions the lane orders do not matter. MVEEXT may be combined with 16914 // loads to produce two extending loads, or else they will be expanded to 16915 // VREV/VMOVL. 16916 EVT VT = Ops[0].getValueType(); 16917 if (VT == MVT::v16i8) { 16918 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) && 16919 "Unexpected illegal long reduction opcode"); 16920 bool IsUnsigned = Opcode == ARMISD::VMLALVu; 16921 16922 SDValue Ext0 = 16923 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16924 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]); 16925 SDValue Ext1 = 16926 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16927 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]); 16928 16929 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 16930 Ext0, Ext1); 16931 SDValue MLA1 = 16932 DAG.getNode(IsUnsigned ? 
ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl, 16933 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1), 16934 Ext0.getValue(1), Ext1.getValue(1)); 16935 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1)); 16936 } 16937 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 16938 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 16939 SDValue(Node.getNode(), 1)); 16940 }; 16941 16942 SDValue A, B; 16943 SDValue Mask; 16944 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 16945 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 16946 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 16947 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 16948 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 16949 A, B)) 16950 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 16951 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 16952 A, B)) 16953 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 16954 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) 16955 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16956 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); 16957 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) 16958 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16959 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); 16960 16961 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 16962 Mask)) 16963 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); 16964 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 16965 Mask)) 16966 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); 16967 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 16968 Mask)) 16969 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); 16970 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 16971 Mask)) 16972 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); 16973 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) 16974 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16975 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); 16976 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) 16977 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16978 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); 16979 16980 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 16981 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 16982 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 16983 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 16984 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) 16985 return Create64bitNode(ARMISD::VADDLVs, {A}); 16986 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) 16987 return Create64bitNode(ARMISD::VADDLVu, {A}); 16988 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) 16989 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16990 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); 16991 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) 16992 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16993 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); 16994 16995 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 16996 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); 16997 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, 
MVT::v16i8}, Mask)) 16998 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); 16999 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask)) 17000 return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); 17001 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask)) 17002 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); 17003 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) 17004 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17005 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); 17006 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) 17007 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17008 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); 17009 17010 // Some complications. We can get a case where the two inputs of the mul are 17011 // the same, then the output sext will have been helpfully converted to a 17012 // zext. Turn it back. 17013 SDValue Op = N0; 17014 if (Op->getOpcode() == ISD::VSELECT) 17015 Op = Op->getOperand(1); 17016 if (Op->getOpcode() == ISD::ZERO_EXTEND && 17017 Op->getOperand(0)->getOpcode() == ISD::MUL) { 17018 SDValue Mul = Op->getOperand(0); 17019 if (Mul->getOperand(0) == Mul->getOperand(1) && 17020 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { 17021 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); 17022 if (Op != N0) 17023 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), 17024 N0->getOperand(0), Ext, N0->getOperand(2)); 17025 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); 17026 } 17027 } 17028 17029 return SDValue(); 17030 } 17031 17032 static SDValue PerformVMOVNCombine(SDNode *N, 17033 TargetLowering::DAGCombinerInfo &DCI) { 17034 SDValue Op0 = N->getOperand(0); 17035 SDValue Op1 = N->getOperand(1); 17036 unsigned IsTop = N->getConstantOperandVal(2); 17037 17038 // VMOVNT a undef -> a 17039 // VMOVNB a undef -> a 17040 // VMOVNB undef a -> a 17041 if (Op1->isUndef()) 17042 return Op0; 17043 if (Op0->isUndef() && !IsTop) 17044 return Op1; 17045 17046 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) 17047 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) 17048 if ((Op1->getOpcode() == ARMISD::VQMOVNs || 17049 Op1->getOpcode() == ARMISD::VQMOVNu) && 17050 Op1->getConstantOperandVal(2) == 0) 17051 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), 17052 Op0, Op1->getOperand(1), N->getOperand(2)); 17053 17054 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from 17055 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting 17056 // into the top or bottom lanes. 17057 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 17058 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); 17059 APInt Op0DemandedElts = 17060 IsTop ? 
Op1DemandedElts 17061 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); 17062 17063 APInt KnownUndef, KnownZero; 17064 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 17065 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, 17066 KnownZero, DCI)) 17067 return SDValue(N, 0); 17068 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef, 17069 KnownZero, DCI)) 17070 return SDValue(N, 0); 17071 17072 return SDValue(); 17073 } 17074 17075 static SDValue PerformVQMOVNCombine(SDNode *N, 17076 TargetLowering::DAGCombinerInfo &DCI) { 17077 SDValue Op0 = N->getOperand(0); 17078 unsigned IsTop = N->getConstantOperandVal(2); 17079 17080 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 17081 APInt Op0DemandedElts = 17082 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) 17083 : APInt::getHighBitsSet(2, 1)); 17084 17085 APInt KnownUndef, KnownZero; 17086 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 17087 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, 17088 KnownZero, DCI)) 17089 return SDValue(N, 0); 17090 return SDValue(); 17091 } 17092 17093 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { 17094 SDLoc DL(N); 17095 SDValue Op0 = N->getOperand(0); 17096 SDValue Op1 = N->getOperand(1); 17097 17098 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from 17099 // uses of the intrinsics. 17100 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 17101 int ShiftAmt = C->getSExtValue(); 17102 if (ShiftAmt == 0) { 17103 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); 17104 DAG.ReplaceAllUsesWith(N, Merge.getNode()); 17105 return SDValue(); 17106 } 17107 17108 if (ShiftAmt >= -32 && ShiftAmt < 0) { 17109 unsigned NewOpcode = 17110 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; 17111 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, 17112 DAG.getConstant(-ShiftAmt, DL, MVT::i32)); 17113 DAG.ReplaceAllUsesWith(N, NewShift.getNode()); 17114 return NewShift; 17115 } 17116 } 17117 17118 return SDValue(); 17119 } 17120 17121 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 17122 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, 17123 DAGCombinerInfo &DCI) const { 17124 SelectionDAG &DAG = DCI.DAG; 17125 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 17126 switch (IntNo) { 17127 default: 17128 // Don't do anything for most intrinsics. 17129 break; 17130 17131 // Vector shifts: check for immediate versions and lower them. 17132 // Note: This is done during DAG combining instead of DAG legalizing because 17133 // the build_vectors for 64-bit vector element shift counts are generally 17134 // not legal, and it is hard to see their values after they get legalized to 17135 // loads from a constant pool. 
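  // Illustrative sketch (example only): a call such as
  //   llvm.arm.neon.vshifts(<4 x i32> %a, <4 x i32> <splat of 3>)
  // has a constant, in-range shift amount and is rewritten here to
  //   (ARMISD::VSHLIMM %a, 3)
  // while the same intrinsic with a splat of -3 becomes an immediate right
  // shift (ARMISD::VSHRsIMM / ARMISD::VSHRuIMM) instead.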
17136 case Intrinsic::arm_neon_vshifts: 17137 case Intrinsic::arm_neon_vshiftu: 17138 case Intrinsic::arm_neon_vrshifts: 17139 case Intrinsic::arm_neon_vrshiftu: 17140 case Intrinsic::arm_neon_vrshiftn: 17141 case Intrinsic::arm_neon_vqshifts: 17142 case Intrinsic::arm_neon_vqshiftu: 17143 case Intrinsic::arm_neon_vqshiftsu: 17144 case Intrinsic::arm_neon_vqshiftns: 17145 case Intrinsic::arm_neon_vqshiftnu: 17146 case Intrinsic::arm_neon_vqshiftnsu: 17147 case Intrinsic::arm_neon_vqrshiftns: 17148 case Intrinsic::arm_neon_vqrshiftnu: 17149 case Intrinsic::arm_neon_vqrshiftnsu: { 17150 EVT VT = N->getOperand(1).getValueType(); 17151 int64_t Cnt; 17152 unsigned VShiftOpc = 0; 17153 17154 switch (IntNo) { 17155 case Intrinsic::arm_neon_vshifts: 17156 case Intrinsic::arm_neon_vshiftu: 17157 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 17158 VShiftOpc = ARMISD::VSHLIMM; 17159 break; 17160 } 17161 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 17162 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 17163 : ARMISD::VSHRuIMM); 17164 break; 17165 } 17166 return SDValue(); 17167 17168 case Intrinsic::arm_neon_vrshifts: 17169 case Intrinsic::arm_neon_vrshiftu: 17170 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 17171 break; 17172 return SDValue(); 17173 17174 case Intrinsic::arm_neon_vqshifts: 17175 case Intrinsic::arm_neon_vqshiftu: 17176 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 17177 break; 17178 return SDValue(); 17179 17180 case Intrinsic::arm_neon_vqshiftsu: 17181 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 17182 break; 17183 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 17184 17185 case Intrinsic::arm_neon_vrshiftn: 17186 case Intrinsic::arm_neon_vqshiftns: 17187 case Intrinsic::arm_neon_vqshiftnu: 17188 case Intrinsic::arm_neon_vqshiftnsu: 17189 case Intrinsic::arm_neon_vqrshiftns: 17190 case Intrinsic::arm_neon_vqrshiftnu: 17191 case Intrinsic::arm_neon_vqrshiftnsu: 17192 // Narrowing shifts require an immediate right shift. 17193 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 17194 break; 17195 llvm_unreachable("invalid shift count for narrowing vector shift " 17196 "intrinsic"); 17197 17198 default: 17199 llvm_unreachable("unhandled vector shift"); 17200 } 17201 17202 switch (IntNo) { 17203 case Intrinsic::arm_neon_vshifts: 17204 case Intrinsic::arm_neon_vshiftu: 17205 // Opcode already set above. 
17206 break; 17207 case Intrinsic::arm_neon_vrshifts: 17208 VShiftOpc = ARMISD::VRSHRsIMM; 17209 break; 17210 case Intrinsic::arm_neon_vrshiftu: 17211 VShiftOpc = ARMISD::VRSHRuIMM; 17212 break; 17213 case Intrinsic::arm_neon_vrshiftn: 17214 VShiftOpc = ARMISD::VRSHRNIMM; 17215 break; 17216 case Intrinsic::arm_neon_vqshifts: 17217 VShiftOpc = ARMISD::VQSHLsIMM; 17218 break; 17219 case Intrinsic::arm_neon_vqshiftu: 17220 VShiftOpc = ARMISD::VQSHLuIMM; 17221 break; 17222 case Intrinsic::arm_neon_vqshiftsu: 17223 VShiftOpc = ARMISD::VQSHLsuIMM; 17224 break; 17225 case Intrinsic::arm_neon_vqshiftns: 17226 VShiftOpc = ARMISD::VQSHRNsIMM; 17227 break; 17228 case Intrinsic::arm_neon_vqshiftnu: 17229 VShiftOpc = ARMISD::VQSHRNuIMM; 17230 break; 17231 case Intrinsic::arm_neon_vqshiftnsu: 17232 VShiftOpc = ARMISD::VQSHRNsuIMM; 17233 break; 17234 case Intrinsic::arm_neon_vqrshiftns: 17235 VShiftOpc = ARMISD::VQRSHRNsIMM; 17236 break; 17237 case Intrinsic::arm_neon_vqrshiftnu: 17238 VShiftOpc = ARMISD::VQRSHRNuIMM; 17239 break; 17240 case Intrinsic::arm_neon_vqrshiftnsu: 17241 VShiftOpc = ARMISD::VQRSHRNsuIMM; 17242 break; 17243 } 17244 17245 SDLoc dl(N); 17246 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 17247 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 17248 } 17249 17250 case Intrinsic::arm_neon_vshiftins: { 17251 EVT VT = N->getOperand(1).getValueType(); 17252 int64_t Cnt; 17253 unsigned VShiftOpc = 0; 17254 17255 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 17256 VShiftOpc = ARMISD::VSLIIMM; 17257 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 17258 VShiftOpc = ARMISD::VSRIIMM; 17259 else { 17260 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 17261 } 17262 17263 SDLoc dl(N); 17264 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 17265 N->getOperand(1), N->getOperand(2), 17266 DAG.getConstant(Cnt, dl, MVT::i32)); 17267 } 17268 17269 case Intrinsic::arm_neon_vqrshifts: 17270 case Intrinsic::arm_neon_vqrshiftu: 17271 // No immediate versions of these to check for. 17272 break; 17273 17274 case Intrinsic::arm_mve_vqdmlah: 17275 case Intrinsic::arm_mve_vqdmlash: 17276 case Intrinsic::arm_mve_vqrdmlah: 17277 case Intrinsic::arm_mve_vqrdmlash: 17278 case Intrinsic::arm_mve_vmla_n_predicated: 17279 case Intrinsic::arm_mve_vmlas_n_predicated: 17280 case Intrinsic::arm_mve_vqdmlah_predicated: 17281 case Intrinsic::arm_mve_vqdmlash_predicated: 17282 case Intrinsic::arm_mve_vqrdmlah_predicated: 17283 case Intrinsic::arm_mve_vqrdmlash_predicated: { 17284 // These intrinsics all take an i32 scalar operand which is narrowed to the 17285 // size of a single lane of the vector type they return. So we don't need 17286 // any bits of that operand above that point, which allows us to eliminate 17287 // uxth/sxth. 17288 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 17289 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 17290 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) 17291 return SDValue(); 17292 break; 17293 } 17294 17295 case Intrinsic::arm_mve_minv: 17296 case Intrinsic::arm_mve_maxv: 17297 case Intrinsic::arm_mve_minav: 17298 case Intrinsic::arm_mve_maxav: 17299 case Intrinsic::arm_mve_minv_predicated: 17300 case Intrinsic::arm_mve_maxv_predicated: 17301 case Intrinsic::arm_mve_minav_predicated: 17302 case Intrinsic::arm_mve_maxav_predicated: { 17303 // These intrinsics all take an i32 scalar operand which is narrowed to the 17304 // size of a single lane of the vector type they take as the other input. 
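    // Illustrative sketch (example only): for a v16i8 reduction only the low
    // 8 bits of the incoming i32 scalar are significant, so a uxtb/sxtb of
    // that operand can be removed by the SimplifyDemandedBits call below.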
17305 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); 17306 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 17307 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17308 return SDValue(); 17309 break; 17310 } 17311 17312 case Intrinsic::arm_mve_addv: { 17313 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, 17314 // which allow PerformADDVecReduce to turn it into VADDLV when possible. 17315 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 17316 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; 17317 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); 17318 } 17319 17320 case Intrinsic::arm_mve_addlv: 17321 case Intrinsic::arm_mve_addlv_predicated: { 17322 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR 17323 // which recombines the two outputs into an i64 17324 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 17325 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? 17326 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : 17327 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps); 17328 17329 SmallVector<SDValue, 4> Ops; 17330 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) 17331 if (i != 2) // skip the unsigned flag 17332 Ops.push_back(N->getOperand(i)); 17333 17334 SDLoc dl(N); 17335 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); 17336 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), 17337 val.getValue(1)); 17338 } 17339 } 17340 17341 return SDValue(); 17342 } 17343 17344 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 17345 /// lowers them. As with the vector shift intrinsics, this is done during DAG 17346 /// combining instead of DAG legalizing because the build_vectors for 64-bit 17347 /// vector element shift counts are generally not legal, and it is hard to see 17348 /// their values after they get legalized to loads from a constant pool. 17349 static SDValue PerformShiftCombine(SDNode *N, 17350 TargetLowering::DAGCombinerInfo &DCI, 17351 const ARMSubtarget *ST) { 17352 SelectionDAG &DAG = DCI.DAG; 17353 EVT VT = N->getValueType(0); 17354 17355 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 17356 N->getOperand(0)->getOpcode() == ISD::AND && 17357 N->getOperand(0)->hasOneUse()) { 17358 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 17359 return SDValue(); 17360 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 17361 // usually show up because instcombine prefers to canonicalize it to 17362 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 17363 // out of GEP lowering in some cases. 17364 SDValue N0 = N->getOperand(0); 17365 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 17366 if (!ShiftAmtNode) 17367 return SDValue(); 17368 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 17369 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 17370 if (!AndMaskNode) 17371 return SDValue(); 17372 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 17373 // Don't transform uxtb/uxth. 
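    // Worked example (illustrative values): with AndMask == 0x3f and
    // ShiftAmt == 2, MaskedBits == countLeadingZeros(0x3f) == 26, so
    //   (shl (and x, 0x3f), 2)
    // becomes
    //   (srl (shl x, 26), 24)
    // which produces exactly the six masked bits shifted left by two, without
    // needing the AND at all.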
17374 if (AndMask == 255 || AndMask == 65535) 17375 return SDValue(); 17376 if (isMask_32(AndMask)) { 17377 uint32_t MaskedBits = countLeadingZeros(AndMask); 17378 if (MaskedBits > ShiftAmt) { 17379 SDLoc DL(N); 17380 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 17381 DAG.getConstant(MaskedBits, DL, MVT::i32)); 17382 return DAG.getNode( 17383 ISD::SRL, DL, MVT::i32, SHL, 17384 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 17385 } 17386 } 17387 } 17388 17389 // Nothing to be done for scalar shifts. 17390 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17391 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 17392 return SDValue(); 17393 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 17394 return SDValue(); 17395 17396 int64_t Cnt; 17397 17398 switch (N->getOpcode()) { 17399 default: llvm_unreachable("unexpected shift opcode"); 17400 17401 case ISD::SHL: 17402 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 17403 SDLoc dl(N); 17404 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 17405 DAG.getConstant(Cnt, dl, MVT::i32)); 17406 } 17407 break; 17408 17409 case ISD::SRA: 17410 case ISD::SRL: 17411 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 17412 unsigned VShiftOpc = 17413 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 17414 SDLoc dl(N); 17415 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 17416 DAG.getConstant(Cnt, dl, MVT::i32)); 17417 } 17418 } 17419 return SDValue(); 17420 } 17421 17422 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be 17423 // split into multiple extending loads, which are simpler to deal with than an 17424 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL 17425 // to convert the type to an f32. 17426 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 17427 SDValue N0 = N->getOperand(0); 17428 if (N0.getOpcode() != ISD::LOAD) 17429 return SDValue(); 17430 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 17431 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 17432 LD->getExtensionType() != ISD::NON_EXTLOAD) 17433 return SDValue(); 17434 EVT FromVT = LD->getValueType(0); 17435 EVT ToVT = N->getValueType(0); 17436 if (!ToVT.isVector()) 17437 return SDValue(); 17438 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 17439 EVT ToEltVT = ToVT.getVectorElementType(); 17440 EVT FromEltVT = FromVT.getVectorElementType(); 17441 17442 unsigned NumElements = 0; 17443 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8) 17444 NumElements = 4; 17445 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) 17446 NumElements = 4; 17447 if (NumElements == 0 || 17448 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || 17449 FromVT.getVectorNumElements() % NumElements != 0 || 17450 !isPowerOf2_32(NumElements)) 17451 return SDValue(); 17452 17453 LLVMContext &C = *DAG.getContext(); 17454 SDLoc DL(LD); 17455 // Details about the old load 17456 SDValue Ch = LD->getChain(); 17457 SDValue BasePtr = LD->getBasePtr(); 17458 Align Alignment = LD->getOriginalAlign(); 17459 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 17460 AAMDNodes AAInfo = LD->getAAInfo(); 17461 17462 ISD::LoadExtType NewExtType = 17463 N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 17464 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 17465 EVT NewFromVT = EVT::getVectorVT( 17466 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 17467 EVT NewToVT = EVT::getVectorVT( 17468 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 17469 17470 SmallVector<SDValue, 4> Loads; 17471 SmallVector<SDValue, 4> Chains; 17472 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 17473 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 17474 SDValue NewPtr = 17475 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 17476 17477 SDValue NewLoad = 17478 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 17479 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 17480 Alignment, MMOFlags, AAInfo); 17481 Loads.push_back(NewLoad); 17482 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 17483 } 17484 17485 // Float truncs need to extended with VCVTB's into their floating point types. 17486 if (FromEltVT == MVT::f16) { 17487 SmallVector<SDValue, 4> Extends; 17488 17489 for (unsigned i = 0; i < Loads.size(); i++) { 17490 SDValue LoadBC = 17491 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); 17492 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, 17493 DAG.getConstant(0, DL, MVT::i32)); 17494 Extends.push_back(FPExt); 17495 } 17496 17497 Loads = Extends; 17498 } 17499 17500 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 17501 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 17502 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); 17503 } 17504 17505 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 17506 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 17507 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 17508 const ARMSubtarget *ST) { 17509 SDValue N0 = N->getOperand(0); 17510 17511 // Check for sign- and zero-extensions of vector extract operations of 8- and 17512 // 16-bit vector elements. NEON and MVE support these directly. They are 17513 // handled during DAG combining because type legalization will promote them 17514 // to 32-bit types and it is messy to recognize the operations after that. 
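  // Illustrative sketch (example only): with NEON or MVE,
  //   (sext (extract_vector_elt v8i16 %v, 3)) to i32
  // can be matched to ARMISD::VGETLANEs (a signed lane move such as
  // vmov.s16), instead of an extract followed by a separate sxth once type
  // legalization has promoted the i16 element to i32.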
17515 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 17516 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 17517 SDValue Vec = N0.getOperand(0); 17518 SDValue Lane = N0.getOperand(1); 17519 EVT VT = N->getValueType(0); 17520 EVT EltVT = N0.getValueType(); 17521 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17522 17523 if (VT == MVT::i32 && 17524 (EltVT == MVT::i8 || EltVT == MVT::i16) && 17525 TLI.isTypeLegal(Vec.getValueType()) && 17526 isa<ConstantSDNode>(Lane)) { 17527 17528 unsigned Opc = 0; 17529 switch (N->getOpcode()) { 17530 default: llvm_unreachable("unexpected opcode"); 17531 case ISD::SIGN_EXTEND: 17532 Opc = ARMISD::VGETLANEs; 17533 break; 17534 case ISD::ZERO_EXTEND: 17535 case ISD::ANY_EXTEND: 17536 Opc = ARMISD::VGETLANEu; 17537 break; 17538 } 17539 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 17540 } 17541 } 17542 17543 if (ST->hasMVEIntegerOps()) 17544 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 17545 return NewLoad; 17546 17547 return SDValue(); 17548 } 17549 17550 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, 17551 const ARMSubtarget *ST) { 17552 if (ST->hasMVEFloatOps()) 17553 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 17554 return NewLoad; 17555 17556 return SDValue(); 17557 } 17558 17559 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating 17560 /// saturates. 17561 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, 17562 const ARMSubtarget *ST) { 17563 EVT VT = N->getValueType(0); 17564 SDValue N0 = N->getOperand(0); 17565 if (!ST->hasMVEIntegerOps()) 17566 return SDValue(); 17567 17568 if (SDValue V = PerformVQDMULHCombine(N, DAG)) 17569 return V; 17570 17571 if (VT != MVT::v4i32 && VT != MVT::v8i16) 17572 return SDValue(); 17573 17574 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { 17575 // Check one is a smin and the other is a smax 17576 if (Min->getOpcode() != ISD::SMIN) 17577 std::swap(Min, Max); 17578 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) 17579 return false; 17580 17581 APInt SaturateC; 17582 if (VT == MVT::v4i32) 17583 SaturateC = APInt(32, (1 << 15) - 1, true); 17584 else //if (VT == MVT::v8i16) 17585 SaturateC = APInt(16, (1 << 7) - 1, true); 17586 17587 APInt MinC, MaxC; 17588 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 17589 MinC != SaturateC) 17590 return false; 17591 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || 17592 MaxC != ~SaturateC) 17593 return false; 17594 return true; 17595 }; 17596 17597 if (IsSignedSaturate(N, N0.getNode())) { 17598 SDLoc DL(N); 17599 MVT ExtVT, HalfVT; 17600 if (VT == MVT::v4i32) { 17601 HalfVT = MVT::v8i16; 17602 ExtVT = MVT::v4i16; 17603 } else { // if (VT == MVT::v8i16) 17604 HalfVT = MVT::v16i8; 17605 ExtVT = MVT::v8i8; 17606 } 17607 17608 // Create a VQMOVNB with undef top lanes, then signed extended into the top 17609 // half. That extend will hopefully be removed if only the bottom bits are 17610 // demanded (though a truncating store, for example). 
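    // Illustrative sketch (example only): for v4i32 the signed case matches a
    // clamp of the form
    //   smin(smax(x, -32768), 32767)   (in either order)
    // i.e. a saturate to i16, which becomes the bottom-lane VQMOVNs and
    // sign_extend_inreg built below.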
17611 SDValue VQMOVN = 17612 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), 17613 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); 17614 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 17615 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, 17616 DAG.getValueType(ExtVT)); 17617 } 17618 17619 auto IsUnsignedSaturate = [&](SDNode *Min) { 17620 // For unsigned, we just need to check for <= 0xffff 17621 if (Min->getOpcode() != ISD::UMIN) 17622 return false; 17623 17624 APInt SaturateC; 17625 if (VT == MVT::v4i32) 17626 SaturateC = APInt(32, (1 << 16) - 1, true); 17627 else //if (VT == MVT::v8i16) 17628 SaturateC = APInt(16, (1 << 8) - 1, true); 17629 17630 APInt MinC; 17631 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 17632 MinC != SaturateC) 17633 return false; 17634 return true; 17635 }; 17636 17637 if (IsUnsignedSaturate(N)) { 17638 SDLoc DL(N); 17639 MVT HalfVT; 17640 unsigned ExtConst; 17641 if (VT == MVT::v4i32) { 17642 HalfVT = MVT::v8i16; 17643 ExtConst = 0x0000FFFF; 17644 } else { //if (VT == MVT::v8i16) 17645 HalfVT = MVT::v16i8; 17646 ExtConst = 0x00FF; 17647 } 17648 17649 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with 17650 // an AND. That extend will hopefully be removed if only the bottom bits are 17651 // demanded (though a truncating store, for example). 17652 SDValue VQMOVN = 17653 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, 17654 DAG.getConstant(0, DL, MVT::i32)); 17655 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 17656 return DAG.getNode(ISD::AND, DL, VT, Bitcast, 17657 DAG.getConstant(ExtConst, DL, VT)); 17658 } 17659 17660 return SDValue(); 17661 } 17662 17663 static const APInt *isPowerOf2Constant(SDValue V) { 17664 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 17665 if (!C) 17666 return nullptr; 17667 const APInt *CV = &C->getAPIntValue(); 17668 return CV->isPowerOf2() ? CV : nullptr; 17669 } 17670 17671 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 17672 // If we have a CMOV, OR and AND combination such as: 17673 // if (x & CN) 17674 // y |= CM; 17675 // 17676 // And: 17677 // * CN is a single bit; 17678 // * All bits covered by CM are known zero in y 17679 // 17680 // Then we can convert this into a sequence of BFI instructions. This will 17681 // always be a win if CM is a single bit, will always be no worse than the 17682 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 17683 // three bits (due to the extra IT instruction). 17684 17685 SDValue Op0 = CMOV->getOperand(0); 17686 SDValue Op1 = CMOV->getOperand(1); 17687 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 17688 auto CC = CCNode->getAPIntValue().getLimitedValue(); 17689 SDValue CmpZ = CMOV->getOperand(4); 17690 17691 // The compare must be against zero. 17692 if (!isNullConstant(CmpZ->getOperand(1))) 17693 return SDValue(); 17694 17695 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 17696 SDValue And = CmpZ->getOperand(0); 17697 if (And->getOpcode() != ISD::AND) 17698 return SDValue(); 17699 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 17700 if (!AndC) 17701 return SDValue(); 17702 SDValue X = And->getOperand(0); 17703 17704 if (CC == ARMCC::EQ) { 17705 // We're performing an "equal to zero" compare. Swap the operands so we 17706 // canonicalize on a "not equal to zero" compare. 
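    // Sketch of the swap (operand names are illustrative), with operands in
    // (FalseVal, TrueVal, cc, CPSR, cmp) order:
    //   (cmov (or y, CM), y, eq, CPSR, (cmpz (and x, CN), 0))
    // selects the same value as
    //   (cmov y, (or y, CM), ne, CPSR, (cmpz (and x, CN), 0)),
    // so after the swap only the NE form, with Op1 as the OR, needs handling.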
17707 std::swap(Op0, Op1); 17708 } else { 17709 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 17710 } 17711 17712 if (Op1->getOpcode() != ISD::OR) 17713 return SDValue(); 17714 17715 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 17716 if (!OrC) 17717 return SDValue(); 17718 SDValue Y = Op1->getOperand(0); 17719 17720 if (Op0 != Y) 17721 return SDValue(); 17722 17723 // Now, is it profitable to continue? 17724 APInt OrCI = OrC->getAPIntValue(); 17725 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 17726 if (OrCI.countPopulation() > Heuristic) 17727 return SDValue(); 17728 17729 // Lastly, can we determine that the bits defined by OrCI 17730 // are zero in Y? 17731 KnownBits Known = DAG.computeKnownBits(Y); 17732 if ((OrCI & Known.Zero) != OrCI) 17733 return SDValue(); 17734 17735 // OK, we can do the combine. 17736 SDValue V = Y; 17737 SDLoc dl(X); 17738 EVT VT = X.getValueType(); 17739 unsigned BitInX = AndC->logBase2(); 17740 17741 if (BitInX != 0) { 17742 // We must shift X first. 17743 X = DAG.getNode(ISD::SRL, dl, VT, X, 17744 DAG.getConstant(BitInX, dl, VT)); 17745 } 17746 17747 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 17748 BitInY < NumActiveBits; ++BitInY) { 17749 if (OrCI[BitInY] == 0) 17750 continue; 17751 APInt Mask(VT.getSizeInBits(), 0); 17752 Mask.setBit(BitInY); 17753 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 17754 // Confusingly, the operand is an *inverted* mask. 17755 DAG.getConstant(~Mask, dl, VT)); 17756 } 17757 17758 return V; 17759 } 17760 17761 // Given N, the value controlling the conditional branch, search for the loop 17762 // intrinsic, returning it, along with how the value is used. We need to handle 17763 // patterns such as the following: 17764 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) 17765 // (brcond (setcc (loop.decrement), 0, eq), exit) 17766 // (brcond (setcc (loop.decrement), 0, ne), header) 17767 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, 17768 bool &Negate) { 17769 switch (N->getOpcode()) { 17770 default: 17771 break; 17772 case ISD::XOR: { 17773 if (!isa<ConstantSDNode>(N.getOperand(1))) 17774 return SDValue(); 17775 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) 17776 return SDValue(); 17777 Negate = !Negate; 17778 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); 17779 } 17780 case ISD::SETCC: { 17781 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); 17782 if (!Const) 17783 return SDValue(); 17784 if (Const->isZero()) 17785 Imm = 0; 17786 else if (Const->isOne()) 17787 Imm = 1; 17788 else 17789 return SDValue(); 17790 CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); 17791 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); 17792 } 17793 case ISD::INTRINSIC_W_CHAIN: { 17794 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); 17795 if (IntOp != Intrinsic::test_start_loop_iterations && 17796 IntOp != Intrinsic::loop_decrement_reg) 17797 return SDValue(); 17798 return N; 17799 } 17800 } 17801 return SDValue(); 17802 } 17803 17804 static SDValue PerformHWLoopCombine(SDNode *N, 17805 TargetLowering::DAGCombinerInfo &DCI, 17806 const ARMSubtarget *ST) { 17807 17808 // The hwloop intrinsics that we're interested are used for control-flow, 17809 // either for entering or exiting the loop: 17810 // - test.start.loop.iterations will test whether its operand is zero. If it 17811 // is zero, the proceeding branch should not enter the loop. 
17812 // - loop.decrement.reg also tests whether its operand is zero. If it is 17813 // zero, the proceeding branch should not branch back to the beginning of 17814 // the loop. 17815 // So here, we need to check that how the brcond is using the result of each 17816 // of the intrinsics to ensure that we're branching to the right place at the 17817 // right time. 17818 17819 ISD::CondCode CC; 17820 SDValue Cond; 17821 int Imm = 1; 17822 bool Negate = false; 17823 SDValue Chain = N->getOperand(0); 17824 SDValue Dest; 17825 17826 if (N->getOpcode() == ISD::BRCOND) { 17827 CC = ISD::SETEQ; 17828 Cond = N->getOperand(1); 17829 Dest = N->getOperand(2); 17830 } else { 17831 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 17832 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 17833 Cond = N->getOperand(2); 17834 Dest = N->getOperand(4); 17835 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 17836 if (!Const->isOne() && !Const->isZero()) 17837 return SDValue(); 17838 Imm = Const->getZExtValue(); 17839 } else 17840 return SDValue(); 17841 } 17842 17843 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 17844 if (!Int) 17845 return SDValue(); 17846 17847 if (Negate) 17848 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 17849 17850 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 17851 return (CC == ISD::SETEQ && Imm == 0) || 17852 (CC == ISD::SETNE && Imm == 1) || 17853 (CC == ISD::SETLT && Imm == 1) || 17854 (CC == ISD::SETULT && Imm == 1); 17855 }; 17856 17857 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 17858 return (CC == ISD::SETEQ && Imm == 1) || 17859 (CC == ISD::SETNE && Imm == 0) || 17860 (CC == ISD::SETGT && Imm == 0) || 17861 (CC == ISD::SETUGT && Imm == 0) || 17862 (CC == ISD::SETGE && Imm == 1) || 17863 (CC == ISD::SETUGE && Imm == 1); 17864 }; 17865 17866 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 17867 "unsupported condition"); 17868 17869 SDLoc dl(Int); 17870 SelectionDAG &DAG = DCI.DAG; 17871 SDValue Elements = Int.getOperand(2); 17872 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 17873 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 17874 && "expected single br user"); 17875 SDNode *Br = *N->use_begin(); 17876 SDValue OtherTarget = Br->getOperand(1); 17877 17878 // Update the unconditional branch to branch to the given Dest. 17879 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 17880 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 17881 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 17882 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 17883 }; 17884 17885 if (IntOp == Intrinsic::test_start_loop_iterations) { 17886 SDValue Res; 17887 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements); 17888 // We expect this 'instruction' to branch when the counter is zero. 17889 if (IsTrueIfZero(CC, Imm)) { 17890 SDValue Ops[] = {Chain, Setup, Dest}; 17891 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 17892 } else { 17893 // The logic is the reverse of what we need for WLS, so find the other 17894 // basic block target: the target of the proceeding br. 
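        // Rough sketch (block names are hypothetical): given
        //   brcond (setcc (test.start.loop.iterations n), 0, ne), %loop.body
        //   br %exit
        // the WLS node must branch when the count is zero, so it is given
        // %exit (the target of the following br), and that unconditional br
        // is retargeted to %loop.body.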
17895 UpdateUncondBr(Br, Dest, DAG); 17896 17897 SDValue Ops[] = {Chain, Setup, OtherTarget}; 17898 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 17899 } 17900 // Update LR count to the new value 17901 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup); 17902 // Update chain 17903 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); 17904 return Res; 17905 } else { 17906 SDValue Size = DAG.getTargetConstant( 17907 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 17908 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 17909 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 17910 DAG.getVTList(MVT::i32, MVT::Other), Args); 17911 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 17912 17913 // We expect this instruction to branch when the count is not zero. 17914 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; 17915 17916 // Update the unconditional branch to target the loop preheader if we've 17917 // found the condition has been reversed. 17918 if (Target == OtherTarget) 17919 UpdateUncondBr(Br, Dest, DAG); 17920 17921 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 17922 SDValue(LoopDec.getNode(), 1), Chain); 17923 17924 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 17925 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 17926 } 17927 return SDValue(); 17928 } 17929 17930 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 17931 SDValue 17932 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 17933 SDValue Cmp = N->getOperand(4); 17934 if (Cmp.getOpcode() != ARMISD::CMPZ) 17935 // Only looking at NE cases. 17936 return SDValue(); 17937 17938 EVT VT = N->getValueType(0); 17939 SDLoc dl(N); 17940 SDValue LHS = Cmp.getOperand(0); 17941 SDValue RHS = Cmp.getOperand(1); 17942 SDValue Chain = N->getOperand(0); 17943 SDValue BB = N->getOperand(1); 17944 SDValue ARMcc = N->getOperand(2); 17945 ARMCC::CondCodes CC = 17946 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 17947 17948 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 17949 // -> (brcond Chain BB CC CPSR Cmp) 17950 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 17951 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 17952 LHS->getOperand(0)->hasOneUse()) { 17953 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 17954 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 17955 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 17956 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 17957 if ((LHS00C && LHS00C->getZExtValue() == 0) && 17958 (LHS01C && LHS01C->getZExtValue() == 1) && 17959 (LHS1C && LHS1C->getZExtValue() == 1) && 17960 (RHSC && RHSC->getZExtValue() == 0)) { 17961 return DAG.getNode( 17962 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 17963 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 17964 } 17965 } 17966 17967 return SDValue(); 17968 } 17969 17970 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 17971 SDValue 17972 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 17973 SDValue Cmp = N->getOperand(4); 17974 if (Cmp.getOpcode() != ARMISD::CMPZ) 17975 // Only looking at EQ and NE cases. 
17976 return SDValue(); 17977 17978 EVT VT = N->getValueType(0); 17979 SDLoc dl(N); 17980 SDValue LHS = Cmp.getOperand(0); 17981 SDValue RHS = Cmp.getOperand(1); 17982 SDValue FalseVal = N->getOperand(0); 17983 SDValue TrueVal = N->getOperand(1); 17984 SDValue ARMcc = N->getOperand(2); 17985 ARMCC::CondCodes CC = 17986 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 17987 17988 // BFI is only available on V6T2+. 17989 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 17990 SDValue R = PerformCMOVToBFICombine(N, DAG); 17991 if (R) 17992 return R; 17993 } 17994 17995 // Simplify 17996 // mov r1, r0 17997 // cmp r1, x 17998 // mov r0, y 17999 // moveq r0, x 18000 // to 18001 // cmp r0, x 18002 // movne r0, y 18003 // 18004 // mov r1, r0 18005 // cmp r1, x 18006 // mov r0, x 18007 // movne r0, y 18008 // to 18009 // cmp r0, x 18010 // movne r0, y 18011 /// FIXME: Turn this into a target neutral optimization? 18012 SDValue Res; 18013 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 18014 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 18015 N->getOperand(3), Cmp); 18016 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 18017 SDValue ARMcc; 18018 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 18019 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 18020 N->getOperand(3), NewCmp); 18021 } 18022 18023 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 18024 // -> (cmov F T CC CPSR Cmp) 18025 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 18026 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 18027 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 18028 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 18029 if ((LHS0C && LHS0C->getZExtValue() == 0) && 18030 (LHS1C && LHS1C->getZExtValue() == 1) && 18031 (RHSC && RHSC->getZExtValue() == 0)) { 18032 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 18033 LHS->getOperand(2), LHS->getOperand(3), 18034 LHS->getOperand(4)); 18035 } 18036 } 18037 18038 if (!VT.isInteger()) 18039 return SDValue(); 18040 18041 // Fold away an unneccessary CMPZ/CMOV 18042 // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> 18043 // if C1==EQ -> CMOV A, B, C2, $cpsr, D 18044 // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D 18045 if (N->getConstantOperandVal(2) == ARMCC::EQ || 18046 N->getConstantOperandVal(2) == ARMCC::NE) { 18047 ARMCC::CondCodes Cond; 18048 if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { 18049 if (N->getConstantOperandVal(2) == ARMCC::NE) 18050 Cond = ARMCC::getOppositeCondition(Cond); 18051 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), 18052 N->getOperand(1), 18053 DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), 18054 N->getOperand(3), C); 18055 } 18056 } 18057 18058 // Materialize a boolean comparison for integers so we can avoid branching. 18059 if (isNullConstant(FalseVal)) { 18060 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 18061 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 18062 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 18063 // right 5 bits will make that 32 be 1, otherwise it will be 0. 
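          // Concretely (values are illustrative): CLZ(0) == 32 == 0b100000
          // and 0b100000 >> 5 == 1, while CLZ of any non-zero difference is
          // at most 31 (< 0b100000), so the shift right by 5 produces 0.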
18064 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 18065 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 18066 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 18067 DAG.getConstant(5, dl, MVT::i32)); 18068 } else { 18069 // CMOV 0, 1, ==, (CMPZ x, y) -> 18070 // (ADDCARRY (SUB x, y), t:0, t:1) 18071 // where t = (SUBCARRY 0, (SUB x, y), 0) 18072 // 18073 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 18074 // x != y. In other words, a carry C == 1 when x == y, C == 0 18075 // otherwise. 18076 // The final ADDCARRY computes 18077 // x - y + (0 - (x - y)) + C == C 18078 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 18079 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 18080 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 18081 // ISD::SUBCARRY returns a borrow but we want the carry here 18082 // actually. 18083 SDValue Carry = 18084 DAG.getNode(ISD::SUB, dl, MVT::i32, 18085 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 18086 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 18087 } 18088 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 18089 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 18090 // This seems pointless but will allow us to combine it further below. 18091 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 18092 SDValue Sub = 18093 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 18094 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 18095 Sub.getValue(1), SDValue()); 18096 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 18097 N->getOperand(3), CPSRGlue.getValue(1)); 18098 FalseVal = Sub; 18099 } 18100 } else if (isNullConstant(TrueVal)) { 18101 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 18102 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 18103 // This seems pointless but will allow us to combine it further below 18104 // Note that we change == for != as this is the dual for the case above. 18105 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 18106 SDValue Sub = 18107 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 18108 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 18109 Sub.getValue(1), SDValue()); 18110 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 18111 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 18112 N->getOperand(3), CPSRGlue.getValue(1)); 18113 FalseVal = Sub; 18114 } 18115 } 18116 18117 // On Thumb1, the DAG above may be further combined if z is a power of 2 18118 // (z == 2 ^ K). 
18119 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 18120 // t1 = (USUBO (SUB x, y), 1) 18121 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 18122 // Result = if K != 0 then (SHL t2:0, K) else t2:0 18123 // 18124 // This also handles the special case of comparing against zero; it's 18125 // essentially, the same pattern, except there's no SUBS: 18126 // CMOV x, z, !=, (CMPZ x, 0) -> 18127 // t1 = (USUBO x, 1) 18128 // t2 = (SUBCARRY x, t1:0, t1:1) 18129 // Result = if K != 0 then (SHL t2:0, K) else t2:0 18130 const APInt *TrueConst; 18131 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 18132 ((FalseVal.getOpcode() == ARMISD::SUBS && 18133 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 18134 (FalseVal == LHS && isNullConstant(RHS))) && 18135 (TrueConst = isPowerOf2Constant(TrueVal))) { 18136 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 18137 unsigned ShiftAmount = TrueConst->logBase2(); 18138 if (ShiftAmount) 18139 TrueVal = DAG.getConstant(1, dl, VT); 18140 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 18141 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 18142 18143 if (ShiftAmount) 18144 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 18145 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 18146 } 18147 18148 if (Res.getNode()) { 18149 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 18150 // Capture demanded bits information that would be otherwise lost. 18151 if (Known.Zero == 0xfffffffe) 18152 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18153 DAG.getValueType(MVT::i1)); 18154 else if (Known.Zero == 0xffffff00) 18155 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18156 DAG.getValueType(MVT::i8)); 18157 else if (Known.Zero == 0xffff0000) 18158 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18159 DAG.getValueType(MVT::i16)); 18160 } 18161 18162 return Res; 18163 } 18164 18165 static SDValue PerformBITCASTCombine(SDNode *N, 18166 TargetLowering::DAGCombinerInfo &DCI, 18167 const ARMSubtarget *ST) { 18168 SelectionDAG &DAG = DCI.DAG; 18169 SDValue Src = N->getOperand(0); 18170 EVT DstVT = N->getValueType(0); 18171 18172 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. 18173 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { 18174 EVT SrcVT = Src.getValueType(); 18175 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) 18176 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); 18177 } 18178 18179 // We may have a bitcast of something that has already had this bitcast 18180 // combine performed on it, so skip past any VECTOR_REG_CASTs. 18181 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) 18182 Src = Src.getOperand(0); 18183 18184 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that 18185 // would be generated is at least the width of the element type. 18186 EVT SrcVT = Src.getValueType(); 18187 if ((Src.getOpcode() == ARMISD::VMOVIMM || 18188 Src.getOpcode() == ARMISD::VMVNIMM || 18189 Src.getOpcode() == ARMISD::VMOVFPIMM) && 18190 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && 18191 DAG.getDataLayout().isBigEndian()) 18192 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); 18193 18194 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x 18195 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 18196 return R; 18197 18198 return SDValue(); 18199 } 18200 18201 // Some combines for the MVETrunc truncations legalizer helper. 
Also lowers the 18202 // node into stack operations after legalizeOps. 18203 SDValue ARMTargetLowering::PerformMVETruncCombine( 18204 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 18205 SelectionDAG &DAG = DCI.DAG; 18206 EVT VT = N->getValueType(0); 18207 SDLoc DL(N); 18208 18209 // MVETrunc(Undef, Undef) -> Undef 18210 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); })) 18211 return DAG.getUNDEF(VT); 18212 18213 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc 18214 if (N->getNumOperands() == 2 && 18215 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC && 18216 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC) 18217 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0), 18218 N->getOperand(0).getOperand(1), 18219 N->getOperand(1).getOperand(0), 18220 N->getOperand(1).getOperand(1)); 18221 18222 // MVETrunc(shuffle, shuffle) -> VMOVN 18223 if (N->getNumOperands() == 2 && 18224 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && 18225 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) { 18226 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode()); 18227 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode()); 18228 18229 if (S0->getOperand(0) == S1->getOperand(0) && 18230 S0->getOperand(1) == S1->getOperand(1)) { 18231 // Construct complete shuffle mask 18232 SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end()); 18233 Mask.append(S1->getMask().begin(), S1->getMask().end()); 18234 18235 if (isVMOVNTruncMask(Mask, VT, false)) 18236 return DAG.getNode( 18237 ARMISD::VMOVN, DL, VT, 18238 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 18239 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 18240 DAG.getConstant(1, DL, MVT::i32)); 18241 if (isVMOVNTruncMask(Mask, VT, true)) 18242 return DAG.getNode( 18243 ARMISD::VMOVN, DL, VT, 18244 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 18245 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 18246 DAG.getConstant(1, DL, MVT::i32)); 18247 } 18248 } 18249 18250 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the 18251 // truncate to a buildvector to allow the generic optimisations to kick in. 
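  // A sketch of that lowering (element names are illustrative):
  //   MVETRUNC (v4i32 build_vector a, b, c, d), (v4i32 build_vector e, f, g, h)
  //     -> (v8i16 build_vector a, b, c, d, e, f, g, h)
  // where each element is re-extracted as an i32 and the implicit truncation
  // is left to the generic BUILD_VECTOR handling.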
18252 if (all_of(N->ops(), [](SDValue Op) { 18253 return Op.getOpcode() == ISD::BUILD_VECTOR || 18254 Op.getOpcode() == ISD::VECTOR_SHUFFLE || 18255 (Op.getOpcode() == ISD::BITCAST && 18256 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR); 18257 })) { 18258 SmallVector<SDValue, 8> Extracts; 18259 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) { 18260 SDValue O = N->getOperand(Op); 18261 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) { 18262 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O, 18263 DAG.getConstant(i, DL, MVT::i32)); 18264 Extracts.push_back(Ext); 18265 } 18266 } 18267 return DAG.getBuildVector(VT, DL, Extracts); 18268 } 18269 18270 // If we are late in the legalization process and nothing has optimised 18271 // the trunc to anything better, lower it to a stack store and reload, 18272 // performing the truncation whilst keeping the lanes in the correct order: 18273 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack; 18274 if (!DCI.isAfterLegalizeDAG()) 18275 return SDValue(); 18276 18277 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 18278 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 18279 int NumIns = N->getNumOperands(); 18280 assert((NumIns == 2 || NumIns == 4) && 18281 "Expected 2 or 4 inputs to an MVETrunc"); 18282 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 18283 if (N->getNumOperands() == 4) 18284 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext()); 18285 18286 SmallVector<SDValue> Chains; 18287 for (int I = 0; I < NumIns; I++) { 18288 SDValue Ptr = DAG.getNode( 18289 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 18290 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType())); 18291 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 18292 DAG.getMachineFunction(), SPFI, I * 16 / NumIns); 18293 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I), 18294 Ptr, MPI, StoreVT, Align(4)); 18295 Chains.push_back(Ch); 18296 } 18297 18298 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 18299 MachinePointerInfo MPI = 18300 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 18301 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4)); 18302 } 18303 18304 // Take a MVEEXT(load x) and split that into (extload x, extload x+8) 18305 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, 18306 SelectionDAG &DAG) { 18307 SDValue N0 = N->getOperand(0); 18308 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode()); 18309 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed()) 18310 return SDValue(); 18311 18312 EVT FromVT = LD->getMemoryVT(); 18313 EVT ToVT = N->getValueType(0); 18314 if (!ToVT.isVector()) 18315 return SDValue(); 18316 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2); 18317 EVT ToEltVT = ToVT.getVectorElementType(); 18318 EVT FromEltVT = FromVT.getVectorElementType(); 18319 18320 unsigned NumElements = 0; 18321 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 18322 NumElements = 4; 18323 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 18324 NumElements = 8; 18325 assert(NumElements != 0); 18326 18327 ISD::LoadExtType NewExtType = 18328 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 18329 if (LD->getExtensionType() != ISD::NON_EXTLOAD && 18330 LD->getExtensionType() != ISD::EXTLOAD && 18331 LD->getExtensionType() != NewExtType) 18332 return SDValue(); 18333 18334 LLVMContext &C = *DAG.getContext(); 18335 SDLoc DL(LD); 18336 // Details about the old load 18337 SDValue Ch = LD->getChain(); 18338 SDValue BasePtr = LD->getBasePtr(); 18339 Align Alignment = LD->getOriginalAlign(); 18340 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 18341 AAMDNodes AAInfo = LD->getAAInfo(); 18342 18343 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 18344 EVT NewFromVT = EVT::getVectorVT( 18345 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 18346 EVT NewToVT = EVT::getVectorVT( 18347 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 18348 18349 SmallVector<SDValue, 4> Loads; 18350 SmallVector<SDValue, 4> Chains; 18351 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 18352 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 18353 SDValue NewPtr = 18354 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 18355 18356 SDValue NewLoad = 18357 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 18358 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 18359 Alignment, MMOFlags, AAInfo); 18360 Loads.push_back(NewLoad); 18361 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 18362 } 18363 18364 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 18365 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 18366 return DAG.getMergeValues(Loads, DL); 18367 } 18368 18369 // Perform combines for MVEEXT. If it has not be optimized to anything better 18370 // before lowering, it gets converted to stack store and extloads performing the 18371 // extend whilst still keeping the same lane ordering. 18372 SDValue ARMTargetLowering::PerformMVEExtCombine( 18373 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 18374 SelectionDAG &DAG = DCI.DAG; 18375 EVT VT = N->getValueType(0); 18376 SDLoc DL(N); 18377 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements"); 18378 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type"); 18379 18380 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 18381 *DAG.getContext()); 18382 auto Extend = [&](SDValue V) { 18383 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V); 18384 return N->getOpcode() == ARMISD::MVESEXT 18385 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT, 18386 DAG.getValueType(ExtVT)) 18387 : DAG.getZeroExtendInReg(VVT, DL, ExtVT); 18388 }; 18389 18390 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP) 18391 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) { 18392 SDValue Ext = Extend(N->getOperand(0)); 18393 return DAG.getMergeValues({Ext, Ext}, DL); 18394 } 18395 18396 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG 18397 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) { 18398 ArrayRef<int> Mask = SVN->getMask(); 18399 assert(Mask.size() == 2 * VT.getVectorNumElements()); 18400 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements()); 18401 unsigned Rev = VT == MVT::v4i32 ? 
ARMISD::VREV32 : ARMISD::VREV16; 18402 SDValue Op0 = SVN->getOperand(0); 18403 SDValue Op1 = SVN->getOperand(1); 18404 18405 auto CheckInregMask = [&](int Start, int Offset) { 18406 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) 18407 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset) 18408 return false; 18409 return true; 18410 }; 18411 SDValue V0 = SDValue(N, 0); 18412 SDValue V1 = SDValue(N, 1); 18413 if (CheckInregMask(0, 0)) 18414 V0 = Extend(Op0); 18415 else if (CheckInregMask(0, 1)) 18416 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 18417 else if (CheckInregMask(0, Mask.size())) 18418 V0 = Extend(Op1); 18419 else if (CheckInregMask(0, Mask.size() + 1)) 18420 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 18421 18422 if (CheckInregMask(VT.getVectorNumElements(), Mask.size())) 18423 V1 = Extend(Op1); 18424 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1)) 18425 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 18426 else if (CheckInregMask(VT.getVectorNumElements(), 0)) 18427 V1 = Extend(Op0); 18428 else if (CheckInregMask(VT.getVectorNumElements(), 1)) 18429 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 18430 18431 if (V0.getNode() != N || V1.getNode() != N) 18432 return DAG.getMergeValues({V0, V1}, DL); 18433 } 18434 18435 // MVEEXT(load) -> extload, extload 18436 if (N->getOperand(0)->getOpcode() == ISD::LOAD) 18437 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG)) 18438 return L; 18439 18440 if (!DCI.isAfterLegalizeDAG()) 18441 return SDValue(); 18442 18443 // Lower to a stack store and reload: 18444 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8; 18445 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 18446 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 18447 int NumOuts = N->getNumValues(); 18448 assert((NumOuts == 2 || NumOuts == 4) && 18449 "Expected 2 or 4 outputs to an MVEEXT"); 18450 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 18451 *DAG.getContext()); 18452 if (N->getNumOperands() == 4) 18453 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext()); 18454 18455 MachinePointerInfo MPI = 18456 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 18457 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0), 18458 StackPtr, MPI, Align(4)); 18459 18460 SmallVector<SDValue> Loads; 18461 for (int I = 0; I < NumOuts; I++) { 18462 SDValue Ptr = DAG.getNode( 18463 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 18464 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType())); 18465 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 18466 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts); 18467 SDValue Load = DAG.getExtLoad( 18468 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, 18469 VT, Chain, Ptr, MPI, LoadVT, Align(4)); 18470 Loads.push_back(Load); 18471 } 18472 18473 return DAG.getMergeValues(Loads, DL); 18474 } 18475 18476 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 18477 DAGCombinerInfo &DCI) const { 18478 switch (N->getOpcode()) { 18479 default: break; 18480 case ISD::SELECT_CC: 18481 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); 18482 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); 18483 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); 18484 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 18485 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 18486 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 18487 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 18488 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 18489 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 18490 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 18491 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 18492 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 18493 case ISD::BRCOND: 18494 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 18495 case ARMISD::ADDC: 18496 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 18497 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 18498 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG); 18499 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 18500 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 18501 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); 18502 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG); 18503 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 18504 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 18505 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 18506 case ISD::EXTRACT_VECTOR_ELT: 18507 return PerformExtractEltCombine(N, DCI, Subtarget); 18508 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); 18509 case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI); 18510 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 18511 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); 18512 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget); 18513 case ISD::FP_TO_SINT: 18514 case ISD::FP_TO_UINT: 18515 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 18516 case ISD::FADD: 18517 return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); 18518 case ISD::FDIV: 18519 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 18520 case ISD::INTRINSIC_WO_CHAIN: 18521 return PerformIntrinsicCombine(N, DCI); 18522 case ISD::SHL: 18523 case ISD::SRA: 18524 case ISD::SRL: 18525 return PerformShiftCombine(N, DCI, Subtarget); 18526 case ISD::SIGN_EXTEND: 18527 case ISD::ZERO_EXTEND: 18528 case ISD::ANY_EXTEND: 18529 return PerformExtendCombine(N, DCI.DAG, Subtarget); 18530 case ISD::FP_EXTEND: 18531 return PerformFPExtendCombine(N, DCI.DAG, Subtarget); 18532 case ISD::SMIN: 18533 case ISD::UMIN: 18534 case ISD::SMAX: 18535 case ISD::UMAX: 18536 return PerformMinMaxCombine(N, DCI.DAG, Subtarget); 18537 case ARMISD::CMOV: 18538 return PerformCMOVCombine(N, DCI.DAG); 18539 case ARMISD::BRCOND: 18540 return PerformBRCONDCombine(N, DCI.DAG); 18541 case ARMISD::CMPZ: 18542 
return PerformCMPZCombine(N, DCI.DAG); 18543 case ARMISD::CSINC: 18544 case ARMISD::CSINV: 18545 case ARMISD::CSNEG: 18546 return PerformCSETCombine(N, DCI.DAG); 18547 case ISD::LOAD: 18548 return PerformLOADCombine(N, DCI, Subtarget); 18549 case ARMISD::VLD1DUP: 18550 case ARMISD::VLD2DUP: 18551 case ARMISD::VLD3DUP: 18552 case ARMISD::VLD4DUP: 18553 return PerformVLDCombine(N, DCI); 18554 case ARMISD::BUILD_VECTOR: 18555 return PerformARMBUILD_VECTORCombine(N, DCI); 18556 case ISD::BITCAST: 18557 return PerformBITCASTCombine(N, DCI, Subtarget); 18558 case ARMISD::PREDICATE_CAST: 18559 return PerformPREDICATE_CASTCombine(N, DCI); 18560 case ARMISD::VECTOR_REG_CAST: 18561 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget); 18562 case ARMISD::MVETRUNC: 18563 return PerformMVETruncCombine(N, DCI); 18564 case ARMISD::MVESEXT: 18565 case ARMISD::MVEZEXT: 18566 return PerformMVEExtCombine(N, DCI); 18567 case ARMISD::VCMP: 18568 return PerformVCMPCombine(N, DCI.DAG, Subtarget); 18569 case ISD::VECREDUCE_ADD: 18570 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); 18571 case ARMISD::VMOVN: 18572 return PerformVMOVNCombine(N, DCI); 18573 case ARMISD::VQMOVNs: 18574 case ARMISD::VQMOVNu: 18575 return PerformVQMOVNCombine(N, DCI); 18576 case ARMISD::ASRL: 18577 case ARMISD::LSRL: 18578 case ARMISD::LSLL: 18579 return PerformLongShiftCombine(N, DCI.DAG); 18580 case ARMISD::SMULWB: { 18581 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18582 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 18583 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 18584 return SDValue(); 18585 break; 18586 } 18587 case ARMISD::SMULWT: { 18588 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18589 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 18590 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 18591 return SDValue(); 18592 break; 18593 } 18594 case ARMISD::SMLALBB: 18595 case ARMISD::QADD16b: 18596 case ARMISD::QSUB16b: 18597 case ARMISD::UQADD16b: 18598 case ARMISD::UQSUB16b: { 18599 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18600 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 18601 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 18602 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18603 return SDValue(); 18604 break; 18605 } 18606 case ARMISD::SMLALBT: { 18607 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 18608 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 18609 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 18610 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 18611 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 18612 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 18613 return SDValue(); 18614 break; 18615 } 18616 case ARMISD::SMLALTB: { 18617 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 18618 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 18619 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 18620 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 18621 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 18622 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 18623 return SDValue(); 18624 break; 18625 } 18626 case ARMISD::SMLALTT: { 18627 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18628 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 18629 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) 
|| 18630 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18631 return SDValue(); 18632 break; 18633 } 18634 case ARMISD::QADD8b: 18635 case ARMISD::QSUB8b: 18636 case ARMISD::UQADD8b: 18637 case ARMISD::UQSUB8b: { 18638 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18639 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 18640 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 18641 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18642 return SDValue(); 18643 break; 18644 } 18645 case ISD::INTRINSIC_VOID: 18646 case ISD::INTRINSIC_W_CHAIN: 18647 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 18648 case Intrinsic::arm_neon_vld1: 18649 case Intrinsic::arm_neon_vld1x2: 18650 case Intrinsic::arm_neon_vld1x3: 18651 case Intrinsic::arm_neon_vld1x4: 18652 case Intrinsic::arm_neon_vld2: 18653 case Intrinsic::arm_neon_vld3: 18654 case Intrinsic::arm_neon_vld4: 18655 case Intrinsic::arm_neon_vld2lane: 18656 case Intrinsic::arm_neon_vld3lane: 18657 case Intrinsic::arm_neon_vld4lane: 18658 case Intrinsic::arm_neon_vld2dup: 18659 case Intrinsic::arm_neon_vld3dup: 18660 case Intrinsic::arm_neon_vld4dup: 18661 case Intrinsic::arm_neon_vst1: 18662 case Intrinsic::arm_neon_vst1x2: 18663 case Intrinsic::arm_neon_vst1x3: 18664 case Intrinsic::arm_neon_vst1x4: 18665 case Intrinsic::arm_neon_vst2: 18666 case Intrinsic::arm_neon_vst3: 18667 case Intrinsic::arm_neon_vst4: 18668 case Intrinsic::arm_neon_vst2lane: 18669 case Intrinsic::arm_neon_vst3lane: 18670 case Intrinsic::arm_neon_vst4lane: 18671 return PerformVLDCombine(N, DCI); 18672 case Intrinsic::arm_mve_vld2q: 18673 case Intrinsic::arm_mve_vld4q: 18674 case Intrinsic::arm_mve_vst2q: 18675 case Intrinsic::arm_mve_vst4q: 18676 return PerformMVEVLDCombine(N, DCI); 18677 default: break; 18678 } 18679 break; 18680 } 18681 return SDValue(); 18682 } 18683 18684 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 18685 EVT VT) const { 18686 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 18687 } 18688 18689 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 18690 Align Alignment, 18691 MachineMemOperand::Flags, 18692 bool *Fast) const { 18693 // Depends what it gets converted into if the type is weird. 18694 if (!VT.isSimple()) 18695 return false; 18696 18697 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus 18698 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 18699 auto Ty = VT.getSimpleVT().SimpleTy; 18700 18701 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 18702 // Unaligned access can use (for example) LRDB, LRDH, LDR 18703 if (AllowsUnaligned) { 18704 if (Fast) 18705 *Fast = Subtarget->hasV7Ops(); 18706 return true; 18707 } 18708 } 18709 18710 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 18711 // For any little-endian targets with neon, we can support unaligned ld/st 18712 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 18713 // A big-endian target may also explicitly support unaligned accesses 18714 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 18715 if (Fast) 18716 *Fast = true; 18717 return true; 18718 } 18719 } 18720 18721 if (!Subtarget->hasMVEIntegerOps()) 18722 return false; 18723 18724 // These are for predicates 18725 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || 18726 Ty == MVT::v2i1)) { 18727 if (Fast) 18728 *Fast = true; 18729 return true; 18730 } 18731 18732 // These are for truncated stores/narrowing loads. 
They are fine so long as 18733 // the alignment is at least the size of the item being loaded 18734 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 18735 Alignment >= VT.getScalarSizeInBits() / 8) { 18736 if (Fast) 18737 *Fast = true; 18738 return true; 18739 } 18740 18741 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 18742 // VSTRW.U32 all store the vector register in exactly the same format, and 18743 // differ only in the range of their immediate offset field and the required 18744 // alignment. So there is always a store that can be used, regardless of 18745 // actual type. 18746 // 18747 // For big endian, that is not the case. But can still emit a (VSTRB.U8; 18748 // VREV64.8) pair and get the same effect. This will likely be better than 18749 // aligning the vector through the stack. 18750 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 18751 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 18752 Ty == MVT::v2f64) { 18753 if (Fast) 18754 *Fast = true; 18755 return true; 18756 } 18757 18758 return false; 18759 } 18760 18761 18762 EVT ARMTargetLowering::getOptimalMemOpType( 18763 const MemOp &Op, const AttributeList &FuncAttributes) const { 18764 // See if we can use NEON instructions for this... 18765 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && 18766 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { 18767 bool Fast; 18768 if (Op.size() >= 16 && 18769 (Op.isAligned(Align(16)) || 18770 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), 18771 MachineMemOperand::MONone, &Fast) && 18772 Fast))) { 18773 return MVT::v2f64; 18774 } else if (Op.size() >= 8 && 18775 (Op.isAligned(Align(8)) || 18776 (allowsMisalignedMemoryAccesses( 18777 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && 18778 Fast))) { 18779 return MVT::f64; 18780 } 18781 } 18782 18783 // Let the target-independent logic figure it out. 18784 return MVT::Other; 18785 } 18786 18787 // 64-bit integers are split into their high and low parts and held in two 18788 // different registers, so the trunc is free since the low register can just 18789 // be used. 18790 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 18791 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 18792 return false; 18793 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 18794 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 18795 return (SrcBits == 64 && DestBits == 32); 18796 } 18797 18798 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 18799 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 18800 !DstVT.isInteger()) 18801 return false; 18802 unsigned SrcBits = SrcVT.getSizeInBits(); 18803 unsigned DestBits = DstVT.getSizeInBits(); 18804 return (SrcBits == 64 && DestBits == 32); 18805 } 18806 18807 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 18808 if (Val.getOpcode() != ISD::LOAD) 18809 return false; 18810 18811 EVT VT1 = Val.getValueType(); 18812 if (!VT1.isSimple() || !VT1.isInteger() || 18813 !VT2.isSimple() || !VT2.isInteger()) 18814 return false; 18815 18816 switch (VT1.getSimpleVT().SimpleTy) { 18817 default: break; 18818 case MVT::i1: 18819 case MVT::i8: 18820 case MVT::i16: 18821 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 
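    // For instance (a sketch), (i32 zext (i8 load addr)) can be selected as
    // a single LDRB, which already writes zeros into bits 8-31, so the
    // extension costs nothing.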
18822 return true; 18823 } 18824 18825 return false; 18826 } 18827 18828 bool ARMTargetLowering::isFNegFree(EVT VT) const { 18829 if (!VT.isSimple()) 18830 return false; 18831 18832 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 18833 // negate values directly (fneg is free). So, we don't want to let the DAG 18834 // combiner rewrite fneg into xors and some other instructions. For f16 and 18835 // FullFP16 argument passing, some bitcast nodes may be introduced, 18836 // triggering this DAG combine rewrite, so we are avoiding that with this. 18837 switch (VT.getSimpleVT().SimpleTy) { 18838 default: break; 18839 case MVT::f16: 18840 return Subtarget->hasFullFP16(); 18841 } 18842 18843 return false; 18844 } 18845 18846 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 18847 /// of the vector elements. 18848 static bool areExtractExts(Value *Ext1, Value *Ext2) { 18849 auto areExtDoubled = [](Instruction *Ext) { 18850 return Ext->getType()->getScalarSizeInBits() == 18851 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 18852 }; 18853 18854 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 18855 !match(Ext2, m_ZExtOrSExt(m_Value())) || 18856 !areExtDoubled(cast<Instruction>(Ext1)) || 18857 !areExtDoubled(cast<Instruction>(Ext2))) 18858 return false; 18859 18860 return true; 18861 } 18862 18863 /// Check if sinking \p I's operands to I's basic block is profitable, because 18864 /// the operands can be folded into a target instruction, e.g. 18865 /// sext/zext can be folded into vsubl. 18866 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 18867 SmallVectorImpl<Use *> &Ops) const { 18868 if (!I->getType()->isVectorTy()) 18869 return false; 18870 18871 if (Subtarget->hasNEON()) { 18872 switch (I->getOpcode()) { 18873 case Instruction::Sub: 18874 case Instruction::Add: { 18875 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 18876 return false; 18877 Ops.push_back(&I->getOperandUse(0)); 18878 Ops.push_back(&I->getOperandUse(1)); 18879 return true; 18880 } 18881 default: 18882 return false; 18883 } 18884 } 18885 18886 if (!Subtarget->hasMVEIntegerOps()) 18887 return false; 18888 18889 auto IsFMSMul = [&](Instruction *I) { 18890 if (!I->hasOneUse()) 18891 return false; 18892 auto *Sub = cast<Instruction>(*I->users().begin()); 18893 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; 18894 }; 18895 auto IsFMS = [&](Instruction *I) { 18896 if (match(I->getOperand(0), m_FNeg(m_Value())) || 18897 match(I->getOperand(1), m_FNeg(m_Value()))) 18898 return true; 18899 return false; 18900 }; 18901 18902 auto IsSinker = [&](Instruction *I, int Operand) { 18903 switch (I->getOpcode()) { 18904 case Instruction::Add: 18905 case Instruction::Mul: 18906 case Instruction::FAdd: 18907 case Instruction::ICmp: 18908 case Instruction::FCmp: 18909 return true; 18910 case Instruction::FMul: 18911 return !IsFMSMul(I); 18912 case Instruction::Sub: 18913 case Instruction::FSub: 18914 case Instruction::Shl: 18915 case Instruction::LShr: 18916 case Instruction::AShr: 18917 return Operand == 1; 18918 case Instruction::Call: 18919 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 18920 switch (II->getIntrinsicID()) { 18921 case Intrinsic::fma: 18922 return !IsFMS(I); 18923 case Intrinsic::sadd_sat: 18924 case Intrinsic::uadd_sat: 18925 case Intrinsic::arm_mve_add_predicated: 18926 case Intrinsic::arm_mve_mul_predicated: 18927 case Intrinsic::arm_mve_qadd_predicated: 18928 case Intrinsic::arm_mve_vhadd: 18929 case 
Intrinsic::arm_mve_hadd_predicated: 18930 case Intrinsic::arm_mve_vqdmull: 18931 case Intrinsic::arm_mve_vqdmull_predicated: 18932 case Intrinsic::arm_mve_vqdmulh: 18933 case Intrinsic::arm_mve_qdmulh_predicated: 18934 case Intrinsic::arm_mve_vqrdmulh: 18935 case Intrinsic::arm_mve_qrdmulh_predicated: 18936 case Intrinsic::arm_mve_fma_predicated: 18937 return true; 18938 case Intrinsic::ssub_sat: 18939 case Intrinsic::usub_sat: 18940 case Intrinsic::arm_mve_sub_predicated: 18941 case Intrinsic::arm_mve_qsub_predicated: 18942 case Intrinsic::arm_mve_hsub_predicated: 18943 case Intrinsic::arm_mve_vhsub: 18944 return Operand == 1; 18945 default: 18946 return false; 18947 } 18948 } 18949 return false; 18950 default: 18951 return false; 18952 } 18953 }; 18954 18955 for (auto OpIdx : enumerate(I->operands())) { 18956 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); 18957 // Make sure we are not already sinking this operand 18958 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) 18959 continue; 18960 18961 Instruction *Shuffle = Op; 18962 if (Shuffle->getOpcode() == Instruction::BitCast) 18963 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); 18964 // We are looking for a splat that can be sunk. 18965 if (!Shuffle || 18966 !match(Shuffle, m_Shuffle( 18967 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), 18968 m_Undef(), m_ZeroMask()))) 18969 continue; 18970 if (!IsSinker(I, OpIdx.index())) 18971 continue; 18972 18973 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 18974 // and vector registers 18975 for (Use &U : Op->uses()) { 18976 Instruction *Insn = cast<Instruction>(U.getUser()); 18977 if (!IsSinker(Insn, U.getOperandNo())) 18978 return false; 18979 } 18980 18981 Ops.push_back(&Shuffle->getOperandUse(0)); 18982 if (Shuffle != Op) 18983 Ops.push_back(&Op->getOperandUse(0)); 18984 Ops.push_back(&OpIdx.value()); 18985 } 18986 return true; 18987 } 18988 18989 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { 18990 if (!Subtarget->hasMVEIntegerOps()) 18991 return nullptr; 18992 Type *SVIType = SVI->getType(); 18993 Type *ScalarType = SVIType->getScalarType(); 18994 18995 if (ScalarType->isFloatTy()) 18996 return Type::getInt32Ty(SVIType->getContext()); 18997 if (ScalarType->isHalfTy()) 18998 return Type::getInt16Ty(SVIType->getContext()); 18999 return nullptr; 19000 } 19001 19002 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 19003 EVT VT = ExtVal.getValueType(); 19004 19005 if (!isTypeLegal(VT)) 19006 return false; 19007 19008 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 19009 if (Ld->isExpandingLoad()) 19010 return false; 19011 } 19012 19013 if (Subtarget->hasMVEIntegerOps()) 19014 return true; 19015 19016 // Don't create a loadext if we can fold the extension into a wide/long 19017 // instruction. 19018 // If there's more than one user instruction, the loadext is desirable no 19019 // matter what. There can be two uses by the same instruction. 
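  // Illustrative example (value names are hypothetical): for
  //   %w = sext <8 x i8> %l to <8 x i16>
  //   %a = add <8 x i16> %w, %y
  // the extension is expected to fold into a widening instruction such as
  // VADDW/VADDL, so forming a separate extending load would not help here.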
19020 if (ExtVal->use_empty() || 19021 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 19022 return true; 19023 19024 SDNode *U = *ExtVal->use_begin(); 19025 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 19026 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) 19027 return false; 19028 19029 return true; 19030 } 19031 19032 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 19033 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 19034 return false; 19035 19036 if (!isTypeLegal(EVT::getEVT(Ty1))) 19037 return false; 19038 19039 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 19040 19041 // Assuming the caller doesn't have a zeroext or signext return parameter, 19042 // truncation all the way down to i1 is valid. 19043 return true; 19044 } 19045 19046 InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 19047 const AddrMode &AM, 19048 Type *Ty, 19049 unsigned AS) const { 19050 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 19051 if (Subtarget->hasFPAO()) 19052 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 19053 return 0; 19054 } 19055 return -1; 19056 } 19057 19058 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster 19059 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be 19060 /// expanded to FMAs when this method returns true, otherwise fmuladd is 19061 /// expanded to fmul + fadd. 19062 /// 19063 /// ARM supports both fused and unfused multiply-add operations; we already 19064 /// lower a pair of fmul and fadd to the latter so it's not clear that there 19065 /// would be a gain or that the gain would be worthwhile enough to risk 19066 /// correctness bugs. 19067 /// 19068 /// For MVE, we set this to true as it helps simplify the need for some 19069 /// patterns (and we don't have the non-fused floating point instruction). 
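/// For example (a sketch), with MVE floating point a v4f32 fmuladd is
/// expected to select to a single VFMA.F32 rather than a VMUL.F32 followed
/// by a VADD.F32.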
19070 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 19071 EVT VT) const { 19072 if (!VT.isSimple()) 19073 return false; 19074 19075 switch (VT.getSimpleVT().SimpleTy) { 19076 case MVT::v4f32: 19077 case MVT::v8f16: 19078 return Subtarget->hasMVEFloatOps(); 19079 case MVT::f16: 19080 return Subtarget->useFPVFMx16(); 19081 case MVT::f32: 19082 return Subtarget->useFPVFMx(); 19083 case MVT::f64: 19084 return Subtarget->useFPVFMx64(); 19085 default: 19086 break; 19087 } 19088 19089 return false; 19090 } 19091 19092 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 19093 if (V < 0) 19094 return false; 19095 19096 unsigned Scale = 1; 19097 switch (VT.getSimpleVT().SimpleTy) { 19098 case MVT::i1: 19099 case MVT::i8: 19100 // Scale == 1; 19101 break; 19102 case MVT::i16: 19103 // Scale == 2; 19104 Scale = 2; 19105 break; 19106 default: 19107 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 19108 // Scale == 4; 19109 Scale = 4; 19110 break; 19111 } 19112 19113 if ((V & (Scale - 1)) != 0) 19114 return false; 19115 return isUInt<5>(V / Scale); 19116 } 19117 19118 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 19119 const ARMSubtarget *Subtarget) { 19120 if (!VT.isInteger() && !VT.isFloatingPoint()) 19121 return false; 19122 if (VT.isVector() && Subtarget->hasNEON()) 19123 return false; 19124 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 19125 !Subtarget->hasMVEFloatOps()) 19126 return false; 19127 19128 bool IsNeg = false; 19129 if (V < 0) { 19130 IsNeg = true; 19131 V = -V; 19132 } 19133 19134 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 19135 19136 // MVE: size * imm7 19137 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 19138 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 19139 case MVT::i32: 19140 case MVT::f32: 19141 return isShiftedUInt<7,2>(V); 19142 case MVT::i16: 19143 case MVT::f16: 19144 return isShiftedUInt<7,1>(V); 19145 case MVT::i8: 19146 return isUInt<7>(V); 19147 default: 19148 return false; 19149 } 19150 } 19151 19152 // half VLDR: 2 * imm8 19153 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 19154 return isShiftedUInt<8, 1>(V); 19155 // VLDR and LDRD: 4 * imm8 19156 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 19157 return isShiftedUInt<8, 2>(V); 19158 19159 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 19160 // + imm12 or - imm8 19161 if (IsNeg) 19162 return isUInt<8>(V); 19163 return isUInt<12>(V); 19164 } 19165 19166 return false; 19167 } 19168 19169 /// isLegalAddressImmediate - Return true if the integer value can be used 19170 /// as the offset of the target addressing mode for load / store of the 19171 /// given type. 19172 static bool isLegalAddressImmediate(int64_t V, EVT VT, 19173 const ARMSubtarget *Subtarget) { 19174 if (V == 0) 19175 return true; 19176 19177 if (!VT.isSimple()) 19178 return false; 19179 19180 if (Subtarget->isThumb1Only()) 19181 return isLegalT1AddressImmediate(V, VT); 19182 else if (Subtarget->isThumb2()) 19183 return isLegalT2AddressImmediate(V, VT, Subtarget); 19184 19185 // ARM mode. 19186 if (V < 0) 19187 V = - V; 19188 switch (VT.getSimpleVT().SimpleTy) { 19189 default: return false; 19190 case MVT::i1: 19191 case MVT::i8: 19192 case MVT::i32: 19193 // +- imm12 19194 return isUInt<12>(V); 19195 case MVT::i16: 19196 // +- imm8 19197 return isUInt<8>(V); 19198 case MVT::f32: 19199 case MVT::f64: 19200 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 
19201 return false; 19202 return isShiftedUInt<8, 2>(V); 19203 } 19204 } 19205 19206 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 19207 EVT VT) const { 19208 int Scale = AM.Scale; 19209 if (Scale < 0) 19210 return false; 19211 19212 switch (VT.getSimpleVT().SimpleTy) { 19213 default: return false; 19214 case MVT::i1: 19215 case MVT::i8: 19216 case MVT::i16: 19217 case MVT::i32: 19218 if (Scale == 1) 19219 return true; 19220 // r + r << imm 19221 Scale = Scale & ~1; 19222 return Scale == 2 || Scale == 4 || Scale == 8; 19223 case MVT::i64: 19224 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 19225 // version in Thumb mode. 19226 // r + r 19227 if (Scale == 1) 19228 return true; 19229 // r * 2 (this can be lowered to r + r). 19230 if (!AM.HasBaseReg && Scale == 2) 19231 return true; 19232 return false; 19233 case MVT::isVoid: 19234 // Note, we allow "void" uses (basically, uses that aren't loads or 19235 // stores), because arm allows folding a scale into many arithmetic 19236 // operations. This should be made more precise and revisited later. 19237 19238 // Allow r << imm, but the imm has to be a multiple of two. 19239 if (Scale & 1) return false; 19240 return isPowerOf2_32(Scale); 19241 } 19242 } 19243 19244 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 19245 EVT VT) const { 19246 const int Scale = AM.Scale; 19247 19248 // Negative scales are not supported in Thumb1. 19249 if (Scale < 0) 19250 return false; 19251 19252 // Thumb1 addressing modes do not support register scaling excepting the 19253 // following cases: 19254 // 1. Scale == 1 means no scaling. 19255 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 19256 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 19257 } 19258 19259 /// isLegalAddressingMode - Return true if the addressing mode represented 19260 /// by AM is legal for this target, for a load/store of the specified type. 19261 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 19262 const AddrMode &AM, Type *Ty, 19263 unsigned AS, Instruction *I) const { 19264 EVT VT = getValueType(DL, Ty, true); 19265 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 19266 return false; 19267 19268 // Can never fold addr of global into load/store. 19269 if (AM.BaseGV) 19270 return false; 19271 19272 switch (AM.Scale) { 19273 case 0: // no scale reg, must be "r+i" or "r", or "i". 19274 break; 19275 default: 19276 // ARM doesn't support any R+R*scale+imm addr modes. 19277 if (AM.BaseOffs) 19278 return false; 19279 19280 if (!VT.isSimple()) 19281 return false; 19282 19283 if (Subtarget->isThumb1Only()) 19284 return isLegalT1ScaledAddressingMode(AM, VT); 19285 19286 if (Subtarget->isThumb2()) 19287 return isLegalT2ScaledAddressingMode(AM, VT); 19288 19289 int Scale = AM.Scale; 19290 switch (VT.getSimpleVT().SimpleTy) { 19291 default: return false; 19292 case MVT::i1: 19293 case MVT::i8: 19294 case MVT::i32: 19295 if (Scale < 0) Scale = -Scale; 19296 if (Scale == 1) 19297 return true; 19298 // r + r << imm 19299 return isPowerOf2_32(Scale & ~1); 19300 case MVT::i16: 19301 case MVT::i64: 19302 // r +/- r 19303 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 19304 return true; 19305 // r * 2 (this can be lowered to r + r). 
19306 if (!AM.HasBaseReg && Scale == 2) 19307 return true; 19308 return false; 19309 19310 case MVT::isVoid: 19311 // Note, we allow "void" uses (basically, uses that aren't loads or 19312 // stores), because arm allows folding a scale into many arithmetic 19313 // operations. This should be made more precise and revisited later. 19314 19315 // Allow r << imm, but the imm has to be a multiple of two. 19316 if (Scale & 1) return false; 19317 return isPowerOf2_32(Scale); 19318 } 19319 } 19320 return true; 19321 } 19322 19323 /// isLegalICmpImmediate - Return true if the specified immediate is legal 19324 /// icmp immediate, that is the target has icmp instructions which can compare 19325 /// a register against the immediate without having to materialize the 19326 /// immediate into a register. 19327 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 19328 // Thumb2 and ARM modes can use cmn for negative immediates. 19329 if (!Subtarget->isThumb()) 19330 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 19331 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 19332 if (Subtarget->isThumb2()) 19333 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 19334 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 19335 // Thumb1 doesn't have cmn, and only 8-bit immediates. 19336 return Imm >= 0 && Imm <= 255; 19337 } 19338 19339 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 19340 /// *or sub* immediate, that is the target has add or sub instructions which can 19341 /// add a register with the immediate without having to materialize the 19342 /// immediate into a register. 19343 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 19344 // Same encoding for add/sub, just flip the sign. 19345 int64_t AbsImm = std::abs(Imm); 19346 if (!Subtarget->isThumb()) 19347 return ARM_AM::getSOImmVal(AbsImm) != -1; 19348 if (Subtarget->isThumb2()) 19349 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 19350 // Thumb1 only has 8-bit unsigned immediate. 19351 return AbsImm >= 0 && AbsImm <= 255; 19352 } 19353 19354 // Return false to prevent folding 19355 // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine, 19356 // if the folding leads to worse code. 19357 bool ARMTargetLowering::isMulAddWithConstProfitable( 19358 const SDValue &AddNode, const SDValue &ConstNode) const { 19359 // Let the DAGCombiner decide for vector types and large types. 19360 const EVT VT = AddNode.getValueType(); 19361 if (VT.isVector() || VT.getScalarSizeInBits() > 32) 19362 return true; 19363 19364 // It is worse if c0 is legal add immediate, while c1*c0 is not 19365 // and has to be composed by at least two instructions. 19366 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1)); 19367 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode); 19368 const int64_t C0 = C0Node->getSExtValue(); 19369 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue(); 19370 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue())) 19371 return true; 19372 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1) 19373 return false; 19374 19375 // Default to true and let the DAGCombiner decide. 
19376 return true; 19377 } 19378 19379 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 19380 bool isSEXTLoad, SDValue &Base, 19381 SDValue &Offset, bool &isInc, 19382 SelectionDAG &DAG) { 19383 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19384 return false; 19385 19386 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 19387 // AddressingMode 3 19388 Base = Ptr->getOperand(0); 19389 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19390 int RHSC = (int)RHS->getZExtValue(); 19391 if (RHSC < 0 && RHSC > -256) { 19392 assert(Ptr->getOpcode() == ISD::ADD); 19393 isInc = false; 19394 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19395 return true; 19396 } 19397 } 19398 isInc = (Ptr->getOpcode() == ISD::ADD); 19399 Offset = Ptr->getOperand(1); 19400 return true; 19401 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 19402 // AddressingMode 2 19403 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19404 int RHSC = (int)RHS->getZExtValue(); 19405 if (RHSC < 0 && RHSC > -0x1000) { 19406 assert(Ptr->getOpcode() == ISD::ADD); 19407 isInc = false; 19408 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19409 Base = Ptr->getOperand(0); 19410 return true; 19411 } 19412 } 19413 19414 if (Ptr->getOpcode() == ISD::ADD) { 19415 isInc = true; 19416 ARM_AM::ShiftOpc ShOpcVal= 19417 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 19418 if (ShOpcVal != ARM_AM::no_shift) { 19419 Base = Ptr->getOperand(1); 19420 Offset = Ptr->getOperand(0); 19421 } else { 19422 Base = Ptr->getOperand(0); 19423 Offset = Ptr->getOperand(1); 19424 } 19425 return true; 19426 } 19427 19428 isInc = (Ptr->getOpcode() == ISD::ADD); 19429 Base = Ptr->getOperand(0); 19430 Offset = Ptr->getOperand(1); 19431 return true; 19432 } 19433 19434 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 19435 return false; 19436 } 19437 19438 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 19439 bool isSEXTLoad, SDValue &Base, 19440 SDValue &Offset, bool &isInc, 19441 SelectionDAG &DAG) { 19442 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19443 return false; 19444 19445 Base = Ptr->getOperand(0); 19446 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19447 int RHSC = (int)RHS->getZExtValue(); 19448 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 19449 assert(Ptr->getOpcode() == ISD::ADD); 19450 isInc = false; 19451 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19452 return true; 19453 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 19454 isInc = Ptr->getOpcode() == ISD::ADD; 19455 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19456 return true; 19457 } 19458 } 19459 19460 return false; 19461 } 19462 19463 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, 19464 bool isSEXTLoad, bool IsMasked, bool isLE, 19465 SDValue &Base, SDValue &Offset, 19466 bool &isInc, SelectionDAG &DAG) { 19467 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19468 return false; 19469 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 19470 return false; 19471 19472 // We allow LE non-masked loads to change the type (for example use a vldrb.8 19473 // as opposed to a vldrw.32). This can allow extra addressing modes or 19474 // alignments for what is otherwise an equivalent instruction. 
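// For instance (illustrative), a post-increment offset of 2 fits the unscaled
// imm7 range of a vldrb.8 but not the imm7<<2 range required by a vldrw.32,
// so changing the access type can make the indexed form legal.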
19475 bool CanChangeType = isLE && !IsMasked; 19476 19477 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 19478 int RHSC = (int)RHS->getZExtValue(); 19479 19480 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 19481 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 19482 assert(Ptr->getOpcode() == ISD::ADD); 19483 isInc = false; 19484 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19485 return true; 19486 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 19487 isInc = Ptr->getOpcode() == ISD::ADD; 19488 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19489 return true; 19490 } 19491 return false; 19492 }; 19493 19494 // Try to find a matching instruction based on s/zext, Alignment, Offset and 19495 // (in BE/masked) type. 19496 Base = Ptr->getOperand(0); 19497 if (VT == MVT::v4i16) { 19498 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) 19499 return true; 19500 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 19501 if (IsInRange(RHSC, 0x80, 1)) 19502 return true; 19503 } else if (Alignment >= 4 && 19504 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 19505 IsInRange(RHSC, 0x80, 4)) 19506 return true; 19507 else if (Alignment >= 2 && 19508 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 19509 IsInRange(RHSC, 0x80, 2)) 19510 return true; 19511 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 19512 return true; 19513 return false; 19514 } 19515 19516 /// getPreIndexedAddressParts - returns true by value, base pointer and 19517 /// offset pointer and addressing mode by reference if the node's address 19518 /// can be legally represented as pre-indexed load / store address. 19519 bool 19520 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 19521 SDValue &Offset, 19522 ISD::MemIndexedMode &AM, 19523 SelectionDAG &DAG) const { 19524 if (Subtarget->isThumb1Only()) 19525 return false; 19526 19527 EVT VT; 19528 SDValue Ptr; 19529 Align Alignment; 19530 bool isSEXTLoad = false; 19531 bool IsMasked = false; 19532 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 19533 Ptr = LD->getBasePtr(); 19534 VT = LD->getMemoryVT(); 19535 Alignment = LD->getAlign(); 19536 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19537 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 19538 Ptr = ST->getBasePtr(); 19539 VT = ST->getMemoryVT(); 19540 Alignment = ST->getAlign(); 19541 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 19542 Ptr = LD->getBasePtr(); 19543 VT = LD->getMemoryVT(); 19544 Alignment = LD->getAlign(); 19545 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19546 IsMasked = true; 19547 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 19548 Ptr = ST->getBasePtr(); 19549 VT = ST->getMemoryVT(); 19550 Alignment = ST->getAlign(); 19551 IsMasked = true; 19552 } else 19553 return false; 19554 19555 bool isInc; 19556 bool isLegal = false; 19557 if (VT.isVector()) 19558 isLegal = Subtarget->hasMVEIntegerOps() && 19559 getMVEIndexedAddressParts( 19560 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, 19561 Subtarget->isLittle(), Base, Offset, isInc, DAG); 19562 else { 19563 if (Subtarget->isThumb2()) 19564 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 19565 Offset, isInc, DAG); 19566 else 19567 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 19568 Offset, isInc, DAG); 19569 } 19570 if (!isLegal) 19571 return false; 19572 19573 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 19574 return true; 19575 } 19576 19577 /// getPostIndexedAddressParts - returns true by value, base pointer and 19578 /// offset pointer and addressing mode by reference if this node can be 19579 /// combined with a load / store to form a post-indexed load / store. 19580 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 19581 SDValue &Base, 19582 SDValue &Offset, 19583 ISD::MemIndexedMode &AM, 19584 SelectionDAG &DAG) const { 19585 EVT VT; 19586 SDValue Ptr; 19587 Align Alignment; 19588 bool isSEXTLoad = false, isNonExt; 19589 bool IsMasked = false; 19590 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 19591 VT = LD->getMemoryVT(); 19592 Ptr = LD->getBasePtr(); 19593 Alignment = LD->getAlign(); 19594 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19595 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 19596 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 19597 VT = ST->getMemoryVT(); 19598 Ptr = ST->getBasePtr(); 19599 Alignment = ST->getAlign(); 19600 isNonExt = !ST->isTruncatingStore(); 19601 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 19602 VT = LD->getMemoryVT(); 19603 Ptr = LD->getBasePtr(); 19604 Alignment = LD->getAlign(); 19605 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19606 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 19607 IsMasked = true; 19608 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 19609 VT = ST->getMemoryVT(); 19610 Ptr = ST->getBasePtr(); 19611 Alignment = ST->getAlign(); 19612 isNonExt = !ST->isTruncatingStore(); 19613 IsMasked = true; 19614 } else 19615 return false; 19616 19617 if (Subtarget->isThumb1Only()) { 19618 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 19619 // must be non-extending/truncating, i32, with an offset of 4. 19620 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 19621 if (Op->getOpcode() != ISD::ADD || !isNonExt) 19622 return false; 19623 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 19624 if (!RHS || RHS->getZExtValue() != 4) 19625 return false; 19626 if (Alignment < Align(4)) 19627 return false; 19628 19629 Offset = Op->getOperand(1); 19630 Base = Op->getOperand(0); 19631 AM = ISD::POST_INC; 19632 return true; 19633 } 19634 19635 bool isInc; 19636 bool isLegal = false; 19637 if (VT.isVector()) 19638 isLegal = Subtarget->hasMVEIntegerOps() && 19639 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, 19640 Subtarget->isLittle(), Base, Offset, 19641 isInc, DAG); 19642 else { 19643 if (Subtarget->isThumb2()) 19644 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 19645 isInc, DAG); 19646 else 19647 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 19648 isInc, DAG); 19649 } 19650 if (!isLegal) 19651 return false; 19652 19653 if (Ptr != Base) { 19654 // Swap base ptr and offset to catch more post-index load / store when 19655 // it's legal. In Thumb2 mode, offset must be an immediate. 19656 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 19657 !Subtarget->isThumb2()) 19658 std::swap(Base, Offset); 19659 19660 // Post-indexed load / store update the base pointer. 19661 if (Ptr != Base) 19662 return false; 19663 } 19664 19665 AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; 19666 return true; 19667 } 19668 19669 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 19670 KnownBits &Known, 19671 const APInt &DemandedElts, 19672 const SelectionDAG &DAG, 19673 unsigned Depth) const { 19674 unsigned BitWidth = Known.getBitWidth(); 19675 Known.resetAll(); 19676 switch (Op.getOpcode()) { 19677 default: break; 19678 case ARMISD::ADDC: 19679 case ARMISD::ADDE: 19680 case ARMISD::SUBC: 19681 case ARMISD::SUBE: 19682 // Special cases when we convert a carry to a boolean. 19683 if (Op.getResNo() == 0) { 19684 SDValue LHS = Op.getOperand(0); 19685 SDValue RHS = Op.getOperand(1); 19686 // (ADDE 0, 0, C) will give us a single bit. 19687 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 19688 isNullConstant(RHS)) { 19689 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 19690 return; 19691 } 19692 } 19693 break; 19694 case ARMISD::CMOV: { 19695 // Bits are known zero/one if known on the LHS and RHS. 19696 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 19697 if (Known.isUnknown()) 19698 return; 19699 19700 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 19701 Known = KnownBits::commonBits(Known, KnownRHS); 19702 return; 19703 } 19704 case ISD::INTRINSIC_W_CHAIN: { 19705 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 19706 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 19707 switch (IntID) { 19708 default: return; 19709 case Intrinsic::arm_ldaex: 19710 case Intrinsic::arm_ldrex: { 19711 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 19712 unsigned MemBits = VT.getScalarSizeInBits(); 19713 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 19714 return; 19715 } 19716 } 19717 } 19718 case ARMISD::BFI: { 19719 // Conservatively, we can recurse down the first operand 19720 // and just mask out all affected bits. 19721 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 19722 19723 // The operand to BFI is already a mask suitable for removing the bits it 19724 // sets. 
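// For example (illustrative), a BFI with mask 0xffff00ff preserves what is
// known about operand 0 outside bits 8-15 and discards any knowledge of the
// inserted field, which is all this conservative query needs.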
19725 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 19726 const APInt &Mask = CI->getAPIntValue(); 19727 Known.Zero &= Mask; 19728 Known.One &= Mask; 19729 return; 19730 } 19731 case ARMISD::VGETLANEs: 19732 case ARMISD::VGETLANEu: { 19733 const SDValue &SrcSV = Op.getOperand(0); 19734 EVT VecVT = SrcSV.getValueType(); 19735 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 19736 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 19737 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 19738 assert(Pos->getAPIntValue().ult(NumSrcElts) && 19739 "VGETLANE index out of bounds"); 19740 unsigned Idx = Pos->getZExtValue(); 19741 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 19742 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 19743 19744 EVT VT = Op.getValueType(); 19745 const unsigned DstSz = VT.getScalarSizeInBits(); 19746 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 19747 (void)SrcSz; 19748 assert(SrcSz == Known.getBitWidth()); 19749 assert(DstSz > SrcSz); 19750 if (Op.getOpcode() == ARMISD::VGETLANEs) 19751 Known = Known.sext(DstSz); 19752 else { 19753 Known = Known.zext(DstSz); 19754 } 19755 assert(DstSz == Known.getBitWidth()); 19756 break; 19757 } 19758 case ARMISD::VMOVrh: { 19759 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 19760 assert(KnownOp.getBitWidth() == 16); 19761 Known = KnownOp.zext(32); 19762 break; 19763 } 19764 case ARMISD::CSINC: 19765 case ARMISD::CSINV: 19766 case ARMISD::CSNEG: { 19767 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 19768 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 19769 19770 // The result is either: 19771 // CSINC: KnownOp0 or KnownOp1 + 1 19772 // CSINV: KnownOp0 or ~KnownOp1 19773 // CSNEG: KnownOp0 or KnownOp1 * -1 19774 if (Op.getOpcode() == ARMISD::CSINC) 19775 KnownOp1 = KnownBits::computeForAddSub( 19776 true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1))); 19777 else if (Op.getOpcode() == ARMISD::CSINV) 19778 std::swap(KnownOp1.Zero, KnownOp1.One); 19779 else if (Op.getOpcode() == ARMISD::CSNEG) 19780 KnownOp1 = KnownBits::mul( 19781 KnownOp1, KnownBits::makeConstant(APInt(32, -1))); 19782 19783 Known = KnownBits::commonBits(KnownOp0, KnownOp1); 19784 break; 19785 } 19786 } 19787 } 19788 19789 bool ARMTargetLowering::targetShrinkDemandedConstant( 19790 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 19791 TargetLoweringOpt &TLO) const { 19792 // Delay optimization, so we don't have to deal with illegal types, or block 19793 // optimizations. 19794 if (!TLO.LegalOps) 19795 return false; 19796 19797 // Only optimize AND for now. 19798 if (Op.getOpcode() != ISD::AND) 19799 return false; 19800 19801 EVT VT = Op.getValueType(); 19802 19803 // Ignore vectors. 19804 if (VT.isVector()) 19805 return false; 19806 19807 assert(VT == MVT::i32 && "Unexpected integer type"); 19808 19809 // Make sure the RHS really is a constant. 19810 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 19811 if (!C) 19812 return false; 19813 19814 unsigned Mask = C->getZExtValue(); 19815 19816 unsigned Demanded = DemandedBits.getZExtValue(); 19817 unsigned ShrunkMask = Mask & Demanded; 19818 unsigned ExpandedMask = Mask | ~Demanded; 19819 19820 // If the mask is all zeros, let the target-independent code replace the 19821 // result with zero. 19822 if (ShrunkMask == 0) 19823 return false; 19824 19825 // If the mask is all ones, erase the AND. 
(Currently, the target-independent 19826 // code won't do this, so we have to do it explicitly to avoid an infinite 19827 // loop in obscure cases.) 19828 if (ExpandedMask == ~0U) 19829 return TLO.CombineTo(Op, Op.getOperand(0)); 19830 19831 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 19832 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 19833 }; 19834 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 19835 if (NewMask == Mask) 19836 return true; 19837 SDLoc DL(Op); 19838 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 19839 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 19840 return TLO.CombineTo(Op, NewOp); 19841 }; 19842 19843 // Prefer uxtb mask. 19844 if (IsLegalMask(0xFF)) 19845 return UseMask(0xFF); 19846 19847 // Prefer uxth mask. 19848 if (IsLegalMask(0xFFFF)) 19849 return UseMask(0xFFFF); 19850 19851 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 19852 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 19853 if (ShrunkMask < 256) 19854 return UseMask(ShrunkMask); 19855 19856 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 19857 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 19858 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 19859 return UseMask(ExpandedMask); 19860 19861 // Potential improvements: 19862 // 19863 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 19864 // We could try to prefer Thumb1 immediates which can be lowered to a 19865 // two-instruction sequence. 19866 // We could try to recognize more legal ARM/Thumb2 immediates here. 19867 19868 return false; 19869 } 19870 19871 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( 19872 SDValue Op, const APInt &OriginalDemandedBits, 19873 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, 19874 unsigned Depth) const { 19875 unsigned Opc = Op.getOpcode(); 19876 19877 switch (Opc) { 19878 case ARMISD::ASRL: 19879 case ARMISD::LSRL: { 19880 // If this is result 0 and the other result is unused, see if the demand 19881 // bits allow us to shrink this long shift into a standard small shift in 19882 // the opposite direction. 19883 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && 19884 isa<ConstantSDNode>(Op->getOperand(2))) { 19885 unsigned ShAmt = Op->getConstantOperandVal(2); 19886 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32) 19887 << (32 - ShAmt))) 19888 return TLO.CombineTo( 19889 Op, TLO.DAG.getNode( 19890 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), 19891 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); 19892 } 19893 break; 19894 } 19895 } 19896 19897 return TargetLowering::SimplifyDemandedBitsForTargetNode( 19898 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); 19899 } 19900 19901 //===----------------------------------------------------------------------===// 19902 // ARM Inline Assembly Support 19903 //===----------------------------------------------------------------------===// 19904 19905 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 19906 // Looking for "rev" which is V6+. 
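// For example (illustrative), a single-statement asm string "rev $0, $1" with
// constraints "=l,l" operating on an i32 is rewritten below into a call to
// llvm.bswap.i32 via IntrinsicLowering::LowerToByteSwap.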
19907 if (!Subtarget->hasV6Ops()) 19908 return false; 19909 19910 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); 19911 std::string AsmStr = IA->getAsmString(); 19912 SmallVector<StringRef, 4> AsmPieces; 19913 SplitString(AsmStr, AsmPieces, ";\n"); 19914 19915 switch (AsmPieces.size()) { 19916 default: return false; 19917 case 1: 19918 AsmStr = std::string(AsmPieces[0]); 19919 AsmPieces.clear(); 19920 SplitString(AsmStr, AsmPieces, " \t,"); 19921 19922 // rev $0, $1 19923 if (AsmPieces.size() == 3 && 19924 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 19925 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 19926 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 19927 if (Ty && Ty->getBitWidth() == 32) 19928 return IntrinsicLowering::LowerToByteSwap(CI); 19929 } 19930 break; 19931 } 19932 19933 return false; 19934 } 19935 19936 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 19937 // At this point, we have to lower this constraint to something else, so we 19938 // lower it to an "r" or "w". However, by doing this we will force the result 19939 // to be in register, while the X constraint is much more permissive. 19940 // 19941 // Although we are correct (we are free to emit anything, without 19942 // constraints), we might break use cases that would expect us to be more 19943 // efficient and emit something else. 19944 if (!Subtarget->hasVFP2Base()) 19945 return "r"; 19946 if (ConstraintVT.isFloatingPoint()) 19947 return "w"; 19948 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 19949 (ConstraintVT.getSizeInBits() == 64 || 19950 ConstraintVT.getSizeInBits() == 128)) 19951 return "w"; 19952 19953 return "r"; 19954 } 19955 19956 /// getConstraintType - Given a constraint letter, return the type of 19957 /// constraint it is for this target. 19958 ARMTargetLowering::ConstraintType 19959 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 19960 unsigned S = Constraint.size(); 19961 if (S == 1) { 19962 switch (Constraint[0]) { 19963 default: break; 19964 case 'l': return C_RegisterClass; 19965 case 'w': return C_RegisterClass; 19966 case 'h': return C_RegisterClass; 19967 case 'x': return C_RegisterClass; 19968 case 't': return C_RegisterClass; 19969 case 'j': return C_Immediate; // Constant for movw. 19970 // An address with a single base register. Due to the way we 19971 // currently handle addresses it is the same as an 'r' memory constraint. 19972 case 'Q': return C_Memory; 19973 } 19974 } else if (S == 2) { 19975 switch (Constraint[0]) { 19976 default: break; 19977 case 'T': return C_RegisterClass; 19978 // All 'U+' constraints are addresses. 19979 case 'U': return C_Memory; 19980 } 19981 } 19982 return TargetLowering::getConstraintType(Constraint); 19983 } 19984 19985 /// Examine constraint type and operand type and determine a weight value. 19986 /// This object must already have been set up with the operand type 19987 /// and the current alternative constraint selected. 19988 TargetLowering::ConstraintWeight 19989 ARMTargetLowering::getSingleConstraintMatchWeight( 19990 AsmOperandInfo &info, const char *constraint) const { 19991 ConstraintWeight weight = CW_Invalid; 19992 Value *CallOperandVal = info.CallOperandVal; 19993 // If we don't have a value, we can't do a match, 19994 // but allow it at the lowest weight. 19995 if (!CallOperandVal) 19996 return CW_Default; 19997 Type *type = CallOperandVal->getType(); 19998 // Look at the constraint type. 
19999 switch (*constraint) { 20000 default: 20001 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 20002 break; 20003 case 'l': 20004 if (type->isIntegerTy()) { 20005 if (Subtarget->isThumb()) 20006 weight = CW_SpecificReg; 20007 else 20008 weight = CW_Register; 20009 } 20010 break; 20011 case 'w': 20012 if (type->isFloatingPointTy()) 20013 weight = CW_Register; 20014 break; 20015 } 20016 return weight; 20017 } 20018 20019 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 20020 20021 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 20022 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 20023 switch (Constraint.size()) { 20024 case 1: 20025 // GCC ARM Constraint Letters 20026 switch (Constraint[0]) { 20027 case 'l': // Low regs or general regs. 20028 if (Subtarget->isThumb()) 20029 return RCPair(0U, &ARM::tGPRRegClass); 20030 return RCPair(0U, &ARM::GPRRegClass); 20031 case 'h': // High regs or no regs. 20032 if (Subtarget->isThumb()) 20033 return RCPair(0U, &ARM::hGPRRegClass); 20034 break; 20035 case 'r': 20036 if (Subtarget->isThumb1Only()) 20037 return RCPair(0U, &ARM::tGPRRegClass); 20038 return RCPair(0U, &ARM::GPRRegClass); 20039 case 'w': 20040 if (VT == MVT::Other) 20041 break; 20042 if (VT == MVT::f32) 20043 return RCPair(0U, &ARM::SPRRegClass); 20044 if (VT.getSizeInBits() == 64) 20045 return RCPair(0U, &ARM::DPRRegClass); 20046 if (VT.getSizeInBits() == 128) 20047 return RCPair(0U, &ARM::QPRRegClass); 20048 break; 20049 case 'x': 20050 if (VT == MVT::Other) 20051 break; 20052 if (VT == MVT::f32) 20053 return RCPair(0U, &ARM::SPR_8RegClass); 20054 if (VT.getSizeInBits() == 64) 20055 return RCPair(0U, &ARM::DPR_8RegClass); 20056 if (VT.getSizeInBits() == 128) 20057 return RCPair(0U, &ARM::QPR_8RegClass); 20058 break; 20059 case 't': 20060 if (VT == MVT::Other) 20061 break; 20062 if (VT == MVT::f32 || VT == MVT::i32) 20063 return RCPair(0U, &ARM::SPRRegClass); 20064 if (VT.getSizeInBits() == 64) 20065 return RCPair(0U, &ARM::DPR_VFP2RegClass); 20066 if (VT.getSizeInBits() == 128) 20067 return RCPair(0U, &ARM::QPR_VFP2RegClass); 20068 break; 20069 } 20070 break; 20071 20072 case 2: 20073 if (Constraint[0] == 'T') { 20074 switch (Constraint[1]) { 20075 default: 20076 break; 20077 case 'e': 20078 return RCPair(0U, &ARM::tGPREvenRegClass); 20079 case 'o': 20080 return RCPair(0U, &ARM::tGPROddRegClass); 20081 } 20082 } 20083 break; 20084 20085 default: 20086 break; 20087 } 20088 20089 if (StringRef("{cc}").equals_insensitive(Constraint)) 20090 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 20091 20092 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 20093 } 20094 20095 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 20096 /// vector. If it is invalid, don't add anything to Ops. 20097 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 20098 std::string &Constraint, 20099 std::vector<SDValue>&Ops, 20100 SelectionDAG &DAG) const { 20101 SDValue Result; 20102 20103 // Currently only support length 1 constraints. 
20104 if (Constraint.length() != 1) return; 20105 20106 char ConstraintLetter = Constraint[0]; 20107 switch (ConstraintLetter) { 20108 default: break; 20109 case 'j': 20110 case 'I': case 'J': case 'K': case 'L': 20111 case 'M': case 'N': case 'O': 20112 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 20113 if (!C) 20114 return; 20115 20116 int64_t CVal64 = C->getSExtValue(); 20117 int CVal = (int) CVal64; 20118 // None of these constraints allow values larger than 32 bits. Check 20119 // that the value fits in an int. 20120 if (CVal != CVal64) 20121 return; 20122 20123 switch (ConstraintLetter) { 20124 case 'j': 20125 // Constant suitable for movw, must be between 0 and 20126 // 65535. 20127 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 20128 if (CVal >= 0 && CVal <= 65535) 20129 break; 20130 return; 20131 case 'I': 20132 if (Subtarget->isThumb1Only()) { 20133 // This must be a constant between 0 and 255, for ADD 20134 // immediates. 20135 if (CVal >= 0 && CVal <= 255) 20136 break; 20137 } else if (Subtarget->isThumb2()) { 20138 // A constant that can be used as an immediate value in a 20139 // data-processing instruction. 20140 if (ARM_AM::getT2SOImmVal(CVal) != -1) 20141 break; 20142 } else { 20143 // A constant that can be used as an immediate value in a 20144 // data-processing instruction. 20145 if (ARM_AM::getSOImmVal(CVal) != -1) 20146 break; 20147 } 20148 return; 20149 20150 case 'J': 20151 if (Subtarget->isThumb1Only()) { 20152 // This must be a constant between -255 and -1, for negated ADD 20153 // immediates. This can be used in GCC with an "n" modifier that 20154 // prints the negated value, for use with SUB instructions. It is 20155 // not useful otherwise but is implemented for compatibility. 20156 if (CVal >= -255 && CVal <= -1) 20157 break; 20158 } else { 20159 // This must be a constant between -4095 and 4095. It is not clear 20160 // what this constraint is intended for. Implemented for 20161 // compatibility with GCC. 20162 if (CVal >= -4095 && CVal <= 4095) 20163 break; 20164 } 20165 return; 20166 20167 case 'K': 20168 if (Subtarget->isThumb1Only()) { 20169 // A 32-bit value where only one byte has a nonzero value. Exclude 20170 // zero to match GCC. This constraint is used by GCC internally for 20171 // constants that can be loaded with a move/shift combination. 20172 // It is not useful otherwise but is implemented for compatibility. 20173 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 20174 break; 20175 } else if (Subtarget->isThumb2()) { 20176 // A constant whose bitwise inverse can be used as an immediate 20177 // value in a data-processing instruction. This can be used in GCC 20178 // with a "B" modifier that prints the inverted value, for use with 20179 // BIC and MVN instructions. It is not useful otherwise but is 20180 // implemented for compatibility. 20181 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 20182 break; 20183 } else { 20184 // A constant whose bitwise inverse can be used as an immediate 20185 // value in a data-processing instruction. This can be used in GCC 20186 // with a "B" modifier that prints the inverted value, for use with 20187 // BIC and MVN instructions. It is not useful otherwise but is 20188 // implemented for compatibility. 20189 if (ARM_AM::getSOImmVal(~CVal) != -1) 20190 break; 20191 } 20192 return; 20193 20194 case 'L': 20195 if (Subtarget->isThumb1Only()) { 20196 // This must be a constant between -7 and 7, 20197 // for 3-operand ADD/SUB immediate instructions. 
20198 if (CVal >= -7 && CVal < 7) 20199 break; 20200 } else if (Subtarget->isThumb2()) { 20201 // A constant whose negation can be used as an immediate value in a 20202 // data-processing instruction. This can be used in GCC with an "n" 20203 // modifier that prints the negated value, for use with SUB 20204 // instructions. It is not useful otherwise but is implemented for 20205 // compatibility. 20206 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 20207 break; 20208 } else { 20209 // A constant whose negation can be used as an immediate value in a 20210 // data-processing instruction. This can be used in GCC with an "n" 20211 // modifier that prints the negated value, for use with SUB 20212 // instructions. It is not useful otherwise but is implemented for 20213 // compatibility. 20214 if (ARM_AM::getSOImmVal(-CVal) != -1) 20215 break; 20216 } 20217 return; 20218 20219 case 'M': 20220 if (Subtarget->isThumb1Only()) { 20221 // This must be a multiple of 4 between 0 and 1020, for 20222 // ADD sp + immediate. 20223 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 20224 break; 20225 } else { 20226 // A power of two or a constant between 0 and 32. This is used in 20227 // GCC for the shift amount on shifted register operands, but it is 20228 // useful in general for any shift amounts. 20229 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 20230 break; 20231 } 20232 return; 20233 20234 case 'N': 20235 if (Subtarget->isThumb1Only()) { 20236 // This must be a constant between 0 and 31, for shift amounts. 20237 if (CVal >= 0 && CVal <= 31) 20238 break; 20239 } 20240 return; 20241 20242 case 'O': 20243 if (Subtarget->isThumb1Only()) { 20244 // This must be a multiple of 4 between -508 and 508, for 20245 // ADD/SUB sp = sp + immediate. 20246 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 20247 break; 20248 } 20249 return; 20250 } 20251 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 20252 break; 20253 } 20254 20255 if (Result.getNode()) { 20256 Ops.push_back(Result); 20257 return; 20258 } 20259 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 20260 } 20261 20262 static RTLIB::Libcall getDivRemLibcall( 20263 const SDNode *N, MVT::SimpleValueType SVT) { 20264 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 20265 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 20266 "Unhandled Opcode in getDivRemLibcall"); 20267 bool isSigned = N->getOpcode() == ISD::SDIVREM || 20268 N->getOpcode() == ISD::SREM; 20269 RTLIB::Libcall LC; 20270 switch (SVT) { 20271 default: llvm_unreachable("Unexpected request for libcall!"); 20272 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 20273 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 20274 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 20275 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 20276 } 20277 return LC; 20278 } 20279 20280 static TargetLowering::ArgListTy getDivRemArgList( 20281 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 20282 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 20283 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 20284 "Unhandled Opcode in getDivRemArgList"); 20285 bool isSigned = N->getOpcode() == ISD::SDIVREM || 20286 N->getOpcode() == ISD::SREM; 20287 TargetLowering::ArgListTy Args; 20288 TargetLowering::ArgListEntry Entry; 20289 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 20290 EVT ArgVT = N->getOperand(i).getValueType(); 20291 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 20292 Entry.Node = N->getOperand(i); 20293 Entry.Ty = ArgTy; 20294 Entry.IsSExt = isSigned; 20295 Entry.IsZExt = !isSigned; 20296 Args.push_back(Entry); 20297 } 20298 if (Subtarget->isTargetWindows() && Args.size() >= 2) 20299 std::swap(Args[0], Args[1]); 20300 return Args; 20301 } 20302 20303 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 20304 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 20305 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 20306 Subtarget->isTargetWindows()) && 20307 "Register-based DivRem lowering only"); 20308 unsigned Opcode = Op->getOpcode(); 20309 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 20310 "Invalid opcode for Div/Rem lowering"); 20311 bool isSigned = (Opcode == ISD::SDIVREM); 20312 EVT VT = Op->getValueType(0); 20313 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 20314 SDLoc dl(Op); 20315 20316 // If the target has hardware divide, use divide + multiply + subtract: 20317 // div = a / b 20318 // rem = a - b * div 20319 // return {div, rem} 20320 // This should be lowered into UDIV/SDIV + MLS later on. 20321 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 20322 : Subtarget->hasDivideInARMMode(); 20323 if (hasDivide && Op->getValueType(0).isSimple() && 20324 Op->getSimpleValueType(0) == MVT::i32) { 20325 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 20326 const SDValue Dividend = Op->getOperand(0); 20327 const SDValue Divisor = Op->getOperand(1); 20328 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 20329 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 20330 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 20331 20332 SDValue Values[2] = {Div, Rem}; 20333 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 20334 } 20335 20336 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 20337 VT.getSimpleVT().SimpleTy); 20338 SDValue InChain = DAG.getEntryNode(); 20339 20340 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 20341 DAG.getContext(), 20342 Subtarget); 20343 20344 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 20345 getPointerTy(DAG.getDataLayout())); 20346 20347 Type *RetTy = StructType::get(Ty, Ty); 20348 20349 if (Subtarget->isTargetWindows()) 20350 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 20351 20352 TargetLowering::CallLoweringInfo CLI(DAG); 20353 CLI.setDebugLoc(dl).setChain(InChain) 20354 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 20355 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 20356 20357 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 20358 return CallInfo.first; 20359 } 20360 20361 // Lowers REM using divmod helpers 20362 // see RTABI section 4.2/4.3 20363 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 20364 // Build return types (div and rem) 20365 std::vector<Type*> RetTyParams; 20366 Type *RetTyElement; 20367 20368 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 20369 default: llvm_unreachable("Unexpected request for libcall!"); 20370 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 20371 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 20372 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 20373 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 20374 } 20375 20376 RetTyParams.push_back(RetTyElement); 20377 RetTyParams.push_back(RetTyElement); 20378 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 20379 Type *RetTy = StructType::get(*DAG.getContext(), ret); 20380 20381 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
20382 SimpleTy); 20383 SDValue InChain = DAG.getEntryNode(); 20384 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 20385 Subtarget); 20386 bool isSigned = N->getOpcode() == ISD::SREM; 20387 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 20388 getPointerTy(DAG.getDataLayout())); 20389 20390 if (Subtarget->isTargetWindows()) 20391 InChain = WinDBZCheckDenominator(DAG, N, InChain); 20392 20393 // Lower call 20394 CallLoweringInfo CLI(DAG); 20395 CLI.setChain(InChain) 20396 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 20397 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 20398 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 20399 20400 // Return second (rem) result operand (first contains div) 20401 SDNode *ResNode = CallResult.first.getNode(); 20402 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 20403 return ResNode->getOperand(1); 20404 } 20405 20406 SDValue 20407 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 20408 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 20409 SDLoc DL(Op); 20410 20411 // Get the inputs. 20412 SDValue Chain = Op.getOperand(0); 20413 SDValue Size = Op.getOperand(1); 20414 20415 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 20416 "no-stack-arg-probe")) { 20417 MaybeAlign Align = 20418 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 20419 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 20420 Chain = SP.getValue(1); 20421 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 20422 if (Align) 20423 SP = 20424 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 20425 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); 20426 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 20427 SDValue Ops[2] = { SP, Chain }; 20428 return DAG.getMergeValues(Ops, DL); 20429 } 20430 20431 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 20432 DAG.getConstant(2, DL, MVT::i32)); 20433 20434 SDValue Flag; 20435 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 20436 Flag = Chain.getValue(1); 20437 20438 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 20439 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 20440 20441 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 20442 Chain = NewSP.getValue(1); 20443 20444 SDValue Ops[2] = { NewSP, Chain }; 20445 return DAG.getMergeValues(Ops, DL); 20446 } 20447 20448 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 20449 bool IsStrict = Op->isStrictFPOpcode(); 20450 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 20451 const unsigned DstSz = Op.getValueType().getSizeInBits(); 20452 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 20453 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 20454 "Unexpected type for custom-lowering FP_EXTEND"); 20455 20456 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 20457 "With both FP DP and 16, any FP conversion is legal!"); 20458 20459 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 20460 "With FP16, 16 to 32 conversion is legal!"); 20461 20462 // Converting from 32 -> 64 is valid if we have FP64. 
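// For example (illustrative), f32 -> f64 with FP64 is a single VCVT.F64.F32;
// the strict-FP case below merely re-emits the extend and returns the
// incoming chain unchanged.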
20463 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { 20464 // FIXME: Remove this when we have strict fp instruction selection patterns 20465 if (IsStrict) { 20466 SDLoc Loc(Op); 20467 SDValue Result = DAG.getNode(ISD::FP_EXTEND, 20468 Loc, Op.getValueType(), SrcVal); 20469 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 20470 } 20471 return Op; 20472 } 20473 20474 // Either we are converting from 16 -> 64, without FP16 and/or 20475 // FP.double-precision or without Armv8-fp. So we must do it in two 20476 // steps. 20477 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 20478 // without FP16. So we must do a function call. 20479 SDLoc Loc(Op); 20480 RTLIB::Libcall LC; 20481 MakeLibCallOptions CallOptions; 20482 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 20483 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 20484 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 20485 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 20486 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64); 20487 if (Supported) { 20488 if (IsStrict) { 20489 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 20490 {DstVT, MVT::Other}, {Chain, SrcVal}); 20491 Chain = SrcVal.getValue(1); 20492 } else { 20493 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 20494 } 20495 } else { 20496 LC = RTLIB::getFPEXT(SrcVT, DstVT); 20497 assert(LC != RTLIB::UNKNOWN_LIBCALL && 20498 "Unexpected type for custom-lowering FP_EXTEND"); 20499 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 20500 Loc, Chain); 20501 } 20502 } 20503 20504 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 20505 } 20506 20507 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 20508 bool IsStrict = Op->isStrictFPOpcode(); 20509 20510 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 20511 EVT SrcVT = SrcVal.getValueType(); 20512 EVT DstVT = Op.getValueType(); 20513 const unsigned DstSz = Op.getValueType().getSizeInBits(); 20514 const unsigned SrcSz = SrcVT.getSizeInBits(); 20515 (void)DstSz; 20516 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 20517 "Unexpected type for custom-lowering FP_ROUND"); 20518 20519 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 20520 "With both FP DP and 16, any FP conversion is legal!"); 20521 20522 SDLoc Loc(Op); 20523 20524 // Instruction from 32 -> 16 if hasFP16 is valid 20525 if (SrcSz == 32 && Subtarget->hasFP16()) 20526 return Op; 20527 20528 // Lib call from 32 -> 16 / 64 -> [32, 16] 20529 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 20530 assert(LC != RTLIB::UNKNOWN_LIBCALL && 20531 "Unexpected type for custom-lowering FP_ROUND"); 20532 MakeLibCallOptions CallOptions; 20533 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 20534 SDValue Result; 20535 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 20536 Loc, Chain); 20537 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 20538 } 20539 20540 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 20541 SelectionDAG &DAG) const { 20542 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 20543 MVT HalfT = MVT::i32; 20544 SDLoc dl(N); 20545 SDValue Hi, Lo, Tmp; 20546 20547 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 20548 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 20549 return ; 20550 20551 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 20552 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 20553 20554 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 20555 DAG.getConstant(0, dl, HalfT)); 20556 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 20557 DAG.getConstant(1, dl, HalfT)); 20558 20559 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 20560 DAG.getConstant(OpTypeBits - 1, dl, 20561 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 20562 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 20563 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 20564 SDValue(Lo.getNode(), 1)); 20565 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 20566 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 20567 20568 Results.push_back(Lo); 20569 Results.push_back(Hi); 20570 } 20571 20572 bool 20573 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 20574 // The ARM target isn't yet aware of offsets. 20575 return false; 20576 } 20577 20578 bool ARM::isBitFieldInvertedMask(unsigned v) { 20579 if (v == 0xffffffff) 20580 return false; 20581 20582 // there can be 1's on either or both "outsides", all the "inside" 20583 // bits must be 0's 20584 return isShiftedMask_32(~v); 20585 } 20586 20587 /// isFPImmLegal - Returns true if the target can instruction select the 20588 /// specified FP immediate natively. If false, the legalizer will 20589 /// materialize the FP immediate as a load from a constant pool. 20590 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 20591 bool ForCodeSize) const { 20592 if (!Subtarget->hasVFP3Base()) 20593 return false; 20594 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 20595 return ARM_AM::getFP16Imm(Imm) != -1; 20596 if (VT == MVT::f32 && Subtarget->hasFullFP16() && 20597 ARM_AM::getFP32FP16Imm(Imm) != -1) 20598 return true; 20599 if (VT == MVT::f32) 20600 return ARM_AM::getFP32Imm(Imm) != -1; 20601 if (VT == MVT::f64 && Subtarget->hasFP64()) 20602 return ARM_AM::getFP64Imm(Imm) != -1; 20603 return false; 20604 } 20605 20606 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 20607 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 20608 /// specified in the intrinsic calls. 20609 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 20610 const CallInst &I, 20611 MachineFunction &MF, 20612 unsigned Intrinsic) const { 20613 switch (Intrinsic) { 20614 case Intrinsic::arm_neon_vld1: 20615 case Intrinsic::arm_neon_vld2: 20616 case Intrinsic::arm_neon_vld3: 20617 case Intrinsic::arm_neon_vld4: 20618 case Intrinsic::arm_neon_vld2lane: 20619 case Intrinsic::arm_neon_vld3lane: 20620 case Intrinsic::arm_neon_vld4lane: 20621 case Intrinsic::arm_neon_vld2dup: 20622 case Intrinsic::arm_neon_vld3dup: 20623 case Intrinsic::arm_neon_vld4dup: { 20624 Info.opc = ISD::INTRINSIC_W_CHAIN; 20625 // Conservatively set memVT to the entire set of vectors loaded. 
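// For example (illustrative), a vld2 returning { <4 x i32>, <4 x i32> }
// touches 256 bits of memory and is modelled here as a v4i64 access.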
20626 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20627 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 20628 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20629 Info.ptrVal = I.getArgOperand(0); 20630 Info.offset = 0; 20631 Value *AlignArg = I.getArgOperand(I.arg_size() - 1); 20632 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 20633 // volatile loads with NEON intrinsics not supported 20634 Info.flags = MachineMemOperand::MOLoad; 20635 return true; 20636 } 20637 case Intrinsic::arm_neon_vld1x2: 20638 case Intrinsic::arm_neon_vld1x3: 20639 case Intrinsic::arm_neon_vld1x4: { 20640 Info.opc = ISD::INTRINSIC_W_CHAIN; 20641 // Conservatively set memVT to the entire set of vectors loaded. 20642 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20643 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 20644 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20645 Info.ptrVal = I.getArgOperand(I.arg_size() - 1); 20646 Info.offset = 0; 20647 Info.align.reset(); 20648 // volatile loads with NEON intrinsics not supported 20649 Info.flags = MachineMemOperand::MOLoad; 20650 return true; 20651 } 20652 case Intrinsic::arm_neon_vst1: 20653 case Intrinsic::arm_neon_vst2: 20654 case Intrinsic::arm_neon_vst3: 20655 case Intrinsic::arm_neon_vst4: 20656 case Intrinsic::arm_neon_vst2lane: 20657 case Intrinsic::arm_neon_vst3lane: 20658 case Intrinsic::arm_neon_vst4lane: { 20659 Info.opc = ISD::INTRINSIC_VOID; 20660 // Conservatively set memVT to the entire set of vectors stored. 20661 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20662 unsigned NumElts = 0; 20663 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { 20664 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 20665 if (!ArgTy->isVectorTy()) 20666 break; 20667 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 20668 } 20669 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20670 Info.ptrVal = I.getArgOperand(0); 20671 Info.offset = 0; 20672 Value *AlignArg = I.getArgOperand(I.arg_size() - 1); 20673 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 20674 // volatile stores with NEON intrinsics not supported 20675 Info.flags = MachineMemOperand::MOStore; 20676 return true; 20677 } 20678 case Intrinsic::arm_neon_vst1x2: 20679 case Intrinsic::arm_neon_vst1x3: 20680 case Intrinsic::arm_neon_vst1x4: { 20681 Info.opc = ISD::INTRINSIC_VOID; 20682 // Conservatively set memVT to the entire set of vectors stored. 20683 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20684 unsigned NumElts = 0; 20685 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { 20686 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 20687 if (!ArgTy->isVectorTy()) 20688 break; 20689 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 20690 } 20691 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20692 Info.ptrVal = I.getArgOperand(0); 20693 Info.offset = 0; 20694 Info.align.reset(); 20695 // volatile stores with NEON intrinsics not supported 20696 Info.flags = MachineMemOperand::MOStore; 20697 return true; 20698 } 20699 case Intrinsic::arm_mve_vld2q: 20700 case Intrinsic::arm_mve_vld4q: { 20701 Info.opc = ISD::INTRINSIC_W_CHAIN; 20702 // Conservatively set memVT to the entire set of vectors loaded. 20703 Type *VecTy = cast<StructType>(I.getType())->getElementType(1); 20704 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile loads with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vst2q:
  case Intrinsic::arm_mve_vst4q: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    Type *VecTy = I.getArgOperand(1)->getType();
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile stores with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_base:
  case Intrinsic::arm_mve_vldr_gather_base_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_base_wb:
  case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_offset:
  case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base:
  case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base_wb:
  case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_offset:
  case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
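    // (Explanatory note: the transfer width of ldrex/ldaex follows the
    // pointee type of the address operand, e.g. an i32* gives a 32-bit
    // exclusive load; the code below simply queries that element type.)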
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value *args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0),  Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}
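
// (Informal note: with AtomicExpansionKind::LLOnly, AtomicExpandPass lowers
// such a 64-bit atomic load via emitLoadLinked below, i.e. roughly a bare
// ldrexd whose two i32 halves are recombined into an i64, with no full LL/SC
// loop needed for a plain load.)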

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
// bits, and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
      Size <= (Subtarget->isMClass() ? 32U : 64U))
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

bool ARMTargetLowering::useLoadStackGuardNode() const { return true; }

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addParamAttr(0, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, it is
  // better to leave it at float, as we have more freedom in the addressing
  // mode for those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
  // We can do a store + vector extract on any vector that fits perfectly in a
  // D or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
                                         Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValueTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
}

void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilderBase &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
                                               Value *Val, Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
    const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;
  // And that the alignment is high enough under MVE.
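  // (Illustrative note: under this rule a v8i16 access needs at least 2-byte
  // alignment and a v4i32 access needs 4-byte alignment; this simply restates
  // the check directly below.)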
  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();
  Align Alignment = LI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
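    // (Illustrative note: with Factor = 2 and <8 x i32> shuffle results, i.e.
    // a <16 x i32> wide load, NumLoads is 2, so we emit two vld2 calls of
    // <4 x i32> lanes each, advancing the base by 4 * 2 = 8 elements between
    // them; see the GEP in the loop below.)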
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
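    // (Illustrative note, mirroring the load path above: each of the
    // NumStores calls writes LaneLen * Factor elements, and the base address
    // is advanced by that amount between calls; see the GEP in the loop
    // below.)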
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
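/// For example, under AAPCS-VFP a struct such as { float, float, float } or
/// an array like [2 x <2 x float>] is a homogeneous aggregate (at most four
/// members of a single base type) and is passed in consecutive VFP registers.
/// (Illustrative examples only, not an exhaustive definition.)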
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}