//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"
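// Editorial note: DEBUG_TYPE is used both by LLVM_DEBUG output and as the
// group name for the STATISTIC counters below (reported with -stats).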
"aarch64-lower" 98 99 STATISTIC(NumTailCalls, "Number of tail calls"); 100 STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 101 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); 102 103 // FIXME: The necessary dtprel relocations don't seem to be supported 104 // well in the GNU bfd and gold linkers at the moment. Therefore, by 105 // default, for now, fall back to GeneralDynamic code generation. 106 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 107 "aarch64-elf-ldtls-generation", cl::Hidden, 108 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 109 cl::init(false)); 110 111 static cl::opt<bool> 112 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, 113 cl::desc("Enable AArch64 logical imm instruction " 114 "optimization"), 115 cl::init(true)); 116 117 // Temporary option added for the purpose of testing functionality added 118 // to DAGCombiner.cpp in D92230. It is expected that this can be removed 119 // in future when both implementations will be based off MGATHER rather 120 // than the GLD1 nodes added for the SVE gather load intrinsics. 121 static cl::opt<bool> 122 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, 123 cl::desc("Combine extends of AArch64 masked " 124 "gather intrinsics"), 125 cl::init(true)); 126 127 /// Value type used for condition codes. 128 static const MVT MVT_CC = MVT::i32; 129 130 static inline EVT getPackedSVEVectorVT(EVT VT) { 131 switch (VT.getSimpleVT().SimpleTy) { 132 default: 133 llvm_unreachable("unexpected element type for vector"); 134 case MVT::i8: 135 return MVT::nxv16i8; 136 case MVT::i16: 137 return MVT::nxv8i16; 138 case MVT::i32: 139 return MVT::nxv4i32; 140 case MVT::i64: 141 return MVT::nxv2i64; 142 case MVT::f16: 143 return MVT::nxv8f16; 144 case MVT::f32: 145 return MVT::nxv4f32; 146 case MVT::f64: 147 return MVT::nxv2f64; 148 case MVT::bf16: 149 return MVT::nxv8bf16; 150 } 151 } 152 153 // NOTE: Currently there's only a need to return integer vector types. If this 154 // changes then just add an extra "type" parameter. 155 static inline EVT getPackedSVEVectorVT(ElementCount EC) { 156 switch (EC.getKnownMinValue()) { 157 default: 158 llvm_unreachable("unexpected element count for vector"); 159 case 16: 160 return MVT::nxv16i8; 161 case 8: 162 return MVT::nxv8i16; 163 case 4: 164 return MVT::nxv4i32; 165 case 2: 166 return MVT::nxv2i64; 167 } 168 } 169 170 static inline EVT getPromotedVTForPredicate(EVT VT) { 171 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && 172 "Expected scalable predicate vector type!"); 173 switch (VT.getVectorMinNumElements()) { 174 default: 175 llvm_unreachable("unexpected element count for vector"); 176 case 2: 177 return MVT::nxv2i64; 178 case 4: 179 return MVT::nxv4i32; 180 case 8: 181 return MVT::nxv8i16; 182 case 16: 183 return MVT::nxv16i8; 184 } 185 } 186 187 /// Returns true if VT's elements occupy the lowest bit positions of its 188 /// associated register class without any intervening space. 189 /// 190 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the 191 /// same register class, but only nxv8f16 can be treated as a packed vector. 
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVE()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }

    for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
    }

    for (auto VT :
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

    for (auto VT :
         { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
           MVT::nxv2f64 }) {
      setCondCodeAction(ISD::SETO, VT, Expand);
      setCondCodeAction(ISD::SETOLT, VT, Expand);
      setCondCodeAction(ISD::SETLT, VT, Expand);
      setCondCodeAction(ISD::SETOLE, VT, Expand);
      setCondCodeAction(ISD::SETLE, VT, Expand);
      setCondCodeAction(ISD::SETULT, VT, Expand);
      setCondCodeAction(ISD::SETULE, VT, Expand);
      setCondCodeAction(ISD::SETUGE, VT, Expand);
      setCondCodeAction(ISD::SETUGT, VT, Expand);
      setCondCodeAction(ISD::SETUEQ, VT, Expand);
      setCondCodeAction(ISD::SETUNE, VT, Expand);

      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FPOWI, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
    }
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  setOperationAction(ISD::CTPOP, MVT::i128, Custom);

  setOperationAction(ISD::ABS, MVT::i32, Custom);
  setOperationAction(ISD::ABS, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  setOperationAction(ISD::FREM, MVT::f16, Promote);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::f16, Promote);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::f16, Promote);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::f16, Promote);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::f16, Promote);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::f16, Promote);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::f16, Promote);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  if (!Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SELECT, MVT::f16, Promote);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
    setOperationAction(ISD::SETCC, MVT::f16, Promote);
    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
    setOperationAction(ISD::FADD, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);
    setOperationAction(ISD::FMA, MVT::f16, Promote);
    setOperationAction(ISD::FNEG, MVT::f16, Promote);
    setOperationAction(ISD::FABS, MVT::f16, Promote);
    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
    setOperationAction(ISD::FRINT, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Promote);
    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

    // promote v4f16 to v4f32 when that is known to be safe.
    setOperationAction(ISD::FADD, MVT::v4f16, Promote);
    setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
    setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
    setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
    AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
    setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
    setOperationAction(ISD::FMINNUM, Ty, Legal);
    setOperationAction(ISD::FMAXNUM, Ty, Legal);
    setOperationAction(ISD::FMINIMUM, Ty, Legal);
    setOperationAction(ISD::FMAXIMUM, Ty, Legal);
    setOperationAction(ISD::LROUND, Ty, Legal);
    setOperationAction(ISD::LLROUND, Ty, Legal);
    setOperationAction(ISD::LRINT, Ty, Legal);
    setOperationAction(ISD::LLRINT, Ty, Legal);
  }

  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
    setOperationAction(ISD::FRINT, MVT::f16, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
    setOperationAction(ISD::FROUND, MVT::f16, Legal);
    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // subtarget
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ABS);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  // TODO: Do the same for FP_TO_*INT_SAT.
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::FP_TO_UINT);
  setTargetDAGCombine(ISD::FDIV);

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::VECTOR_SPLICE);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);
  if (Subtarget->supportsAddressTopByteIgnored())
    setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VECREDUCE_ADD);
  setTargetDAGCombine(ISD::STEP_VECTOR);

  setTargetDAGCombine(ISD::GlobalAddress);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the subtarget, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector ->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Or, direct i32 -> f16 vector conversion. Set it to be custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // when AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // Saturates
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
      setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
    }

    if (Subtarget->hasFullFP16()) {
      for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
        setOperationAction(ISD::FFLOOR, Ty, Legal);
        setOperationAction(ISD::FNEARBYINT, Ty, Legal);
        setOperationAction(ISD::FCEIL, Ty, Legal);
        setOperationAction(ISD::FRINT, Ty, Legal);
        setOperationAction(ISD::FTRUNC, Ty, Legal);
        setOperationAction(ISD::FROUND, Ty, Legal);
        setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
      }
    }

    if (Subtarget->hasSVE())
      setOperationAction(ISD::VSCALE, MVT::i32, Custom);

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
  }

  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);

      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);
    }

    // Illegal unpacked integer vector types.
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
      setOperationAction(ISD::BITCAST, VT, Custom);

    for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      }
    }

    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
      for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
        // Avoid marking truncating FP stores as legal to prevent the
        // DAGCombiner from creating unsupported truncating stores.
        setTruncStoreAction(VT, InnerVT, Expand);
        // SVE does not have floating-point extending loads.
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // SVE supports truncating stores of 64 and 128-bit vectors
    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);

    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FDIV, VT, Custom);
      setOperationAction(ISD::FMA, VT, Custom);
      setOperationAction(ISD::FMAXIMUM, VT, Custom);
      setOperationAction(ISD::FMAXNUM, VT, Custom);
      setOperationAction(ISD::FMINIMUM, VT, Custom);
      setOperationAction(ISD::FMINNUM, VT, Custom);
      setOperationAction(ISD::FMUL, VT, Custom);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);
      setOperationAction(ISD::FCEIL, VT, Custom);
      setOperationAction(ISD::FFLOOR, VT, Custom);
      setOperationAction(ISD::FNEARBYINT, VT, Custom);
      setOperationAction(ISD::FRINT, VT, Custom);
      setOperationAction(ISD::FROUND, VT, Custom);
      setOperationAction(ISD::FROUNDEVEN, VT, Custom);
      setOperationAction(ISD::FTRUNC, VT, Custom);
      setOperationAction(ISD::FSQRT, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
    }

    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
    }

    setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);

    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);

    // NOTE: Currently this has to happen after computeRegisterProperties rather
    // than the preferred option of combining it with the addRegisterClass call.
    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);
      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);

      // 64bit results can mean a bigger than NEON input.
1321 for (auto VT : {MVT::v8i8, MVT::v4i16}) 1322 setOperationAction(ISD::TRUNCATE, VT, Custom); 1323 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); 1324 1325 // 128bit results imply a bigger than NEON input. 1326 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) 1327 setOperationAction(ISD::TRUNCATE, VT, Custom); 1328 for (auto VT : {MVT::v8f16, MVT::v4f32}) 1329 setOperationAction(ISD::FP_ROUND, VT, Custom); 1330 1331 // These operations are not supported on NEON but SVE can do them. 1332 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); 1333 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); 1334 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); 1335 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 1336 setOperationAction(ISD::MUL, MVT::v1i64, Custom); 1337 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 1338 setOperationAction(ISD::MULHS, MVT::v1i64, Custom); 1339 setOperationAction(ISD::MULHS, MVT::v2i64, Custom); 1340 setOperationAction(ISD::MULHU, MVT::v1i64, Custom); 1341 setOperationAction(ISD::MULHU, MVT::v2i64, Custom); 1342 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 1343 setOperationAction(ISD::SDIV, MVT::v16i8, Custom); 1344 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 1345 setOperationAction(ISD::SDIV, MVT::v8i16, Custom); 1346 setOperationAction(ISD::SDIV, MVT::v2i32, Custom); 1347 setOperationAction(ISD::SDIV, MVT::v4i32, Custom); 1348 setOperationAction(ISD::SDIV, MVT::v1i64, Custom); 1349 setOperationAction(ISD::SDIV, MVT::v2i64, Custom); 1350 setOperationAction(ISD::SMAX, MVT::v1i64, Custom); 1351 setOperationAction(ISD::SMAX, MVT::v2i64, Custom); 1352 setOperationAction(ISD::SMIN, MVT::v1i64, Custom); 1353 setOperationAction(ISD::SMIN, MVT::v2i64, Custom); 1354 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 1355 setOperationAction(ISD::UDIV, MVT::v16i8, Custom); 1356 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 1357 setOperationAction(ISD::UDIV, MVT::v8i16, Custom); 1358 setOperationAction(ISD::UDIV, MVT::v2i32, Custom); 1359 setOperationAction(ISD::UDIV, MVT::v4i32, Custom); 1360 setOperationAction(ISD::UDIV, MVT::v1i64, Custom); 1361 setOperationAction(ISD::UDIV, MVT::v2i64, Custom); 1362 setOperationAction(ISD::UMAX, MVT::v1i64, Custom); 1363 setOperationAction(ISD::UMAX, MVT::v2i64, Custom); 1364 setOperationAction(ISD::UMIN, MVT::v1i64, Custom); 1365 setOperationAction(ISD::UMIN, MVT::v2i64, Custom); 1366 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); 1367 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); 1368 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); 1369 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); 1370 1371 // Int operations with no NEON support. 1372 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 1373 MVT::v2i32, MVT::v4i32, MVT::v2i64}) { 1374 setOperationAction(ISD::BITREVERSE, VT, Custom); 1375 setOperationAction(ISD::CTTZ, VT, Custom); 1376 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1377 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1378 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1379 } 1380 1381 // FP operations with no NEON support. 1382 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, 1383 MVT::v1f64, MVT::v2f64}) 1384 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1385 1386 // Use SVE for vectors with more than 2 elements. 
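    // (Two-element floating-point reductions are left to the existing NEON
    // lowering, which can use pairwise instructions such as FADDP.)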
1387 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) 1388 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1389 } 1390 1391 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); 1392 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); 1393 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); 1394 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); 1395 } 1396 1397 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 1398 } 1399 1400 void AArch64TargetLowering::addTypeForNEON(MVT VT) { 1401 assert(VT.isVector() && "VT should be a vector type"); 1402 1403 if (VT.isFloatingPoint()) { 1404 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); 1405 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); 1406 setOperationPromotedToType(ISD::STORE, VT, PromoteTo); 1407 } 1408 1409 // Mark vector float intrinsics as expand. 1410 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 1411 setOperationAction(ISD::FSIN, VT, Expand); 1412 setOperationAction(ISD::FCOS, VT, Expand); 1413 setOperationAction(ISD::FPOW, VT, Expand); 1414 setOperationAction(ISD::FLOG, VT, Expand); 1415 setOperationAction(ISD::FLOG2, VT, Expand); 1416 setOperationAction(ISD::FLOG10, VT, Expand); 1417 setOperationAction(ISD::FEXP, VT, Expand); 1418 setOperationAction(ISD::FEXP2, VT, Expand); 1419 } 1420 1421 // But we do support custom-lowering for FCOPYSIGN. 1422 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || 1423 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) 1424 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1425 1426 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1427 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1428 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1429 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1430 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1431 setOperationAction(ISD::SRA, VT, Custom); 1432 setOperationAction(ISD::SRL, VT, Custom); 1433 setOperationAction(ISD::SHL, VT, Custom); 1434 setOperationAction(ISD::OR, VT, Custom); 1435 setOperationAction(ISD::SETCC, VT, Custom); 1436 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 1437 1438 setOperationAction(ISD::SELECT, VT, Expand); 1439 setOperationAction(ISD::SELECT_CC, VT, Expand); 1440 setOperationAction(ISD::VSELECT, VT, Expand); 1441 for (MVT InnerVT : MVT::all_valuetypes()) 1442 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 1443 1444 // CNT supports only B element sizes, then use UADDLP to widen. 1445 if (VT != MVT::v8i8 && VT != MVT::v16i8) 1446 setOperationAction(ISD::CTPOP, VT, Custom); 1447 1448 setOperationAction(ISD::UDIV, VT, Expand); 1449 setOperationAction(ISD::SDIV, VT, Expand); 1450 setOperationAction(ISD::UREM, VT, Expand); 1451 setOperationAction(ISD::SREM, VT, Expand); 1452 setOperationAction(ISD::FREM, VT, Expand); 1453 1454 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1455 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1456 1457 if (!VT.isFloatingPoint()) 1458 setOperationAction(ISD::ABS, VT, Legal); 1459 1460 // [SU][MIN|MAX] are available for all NEON types apart from i64. 1461 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 1462 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 1463 setOperationAction(Opcode, VT, Legal); 1464 1465 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. 
1466 if (VT.isFloatingPoint() && 1467 VT.getVectorElementType() != MVT::bf16 && 1468 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) 1469 for (unsigned Opcode : 1470 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) 1471 setOperationAction(Opcode, VT, Legal); 1472 1473 if (Subtarget->isLittleEndian()) { 1474 for (unsigned im = (unsigned)ISD::PRE_INC; 1475 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1476 setIndexedLoadAction(im, VT, Legal); 1477 setIndexedStoreAction(im, VT, Legal); 1478 } 1479 } 1480 } 1481 1482 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { 1483 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 1484 1485 // By default everything must be expanded. 1486 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) 1487 setOperationAction(Op, VT, Expand); 1488 1489 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. 1490 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1491 1492 if (VT.isFloatingPoint()) { 1493 setCondCodeAction(ISD::SETO, VT, Expand); 1494 setCondCodeAction(ISD::SETOLT, VT, Expand); 1495 setCondCodeAction(ISD::SETLT, VT, Expand); 1496 setCondCodeAction(ISD::SETOLE, VT, Expand); 1497 setCondCodeAction(ISD::SETLE, VT, Expand); 1498 setCondCodeAction(ISD::SETULT, VT, Expand); 1499 setCondCodeAction(ISD::SETULE, VT, Expand); 1500 setCondCodeAction(ISD::SETUGE, VT, Expand); 1501 setCondCodeAction(ISD::SETUGT, VT, Expand); 1502 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1503 setCondCodeAction(ISD::SETUNE, VT, Expand); 1504 } 1505 1506 // Mark integer truncating stores as having custom lowering 1507 if (VT.isInteger()) { 1508 MVT InnerVT = VT.changeVectorElementType(MVT::i8); 1509 while (InnerVT != VT) { 1510 setTruncStoreAction(VT, InnerVT, Custom); 1511 InnerVT = InnerVT.changeVectorElementType( 1512 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); 1513 } 1514 } 1515 1516 // Lower fixed length vector operations to scalable equivalents. 
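  // Each operation below is marked Custom; the lowering typically copies the
  // fixed length operands into scalable containers (see
  // getContainerForFixedLengthVector and convertToScalableVector declared
  // later in this file), performs the operation on the scalable type, and
  // converts the result back to a fixed length vector.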
1517 setOperationAction(ISD::ABS, VT, Custom); 1518 setOperationAction(ISD::ADD, VT, Custom); 1519 setOperationAction(ISD::AND, VT, Custom); 1520 setOperationAction(ISD::ANY_EXTEND, VT, Custom); 1521 setOperationAction(ISD::BITCAST, VT, Custom); 1522 setOperationAction(ISD::BITREVERSE, VT, Custom); 1523 setOperationAction(ISD::BSWAP, VT, Custom); 1524 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1525 setOperationAction(ISD::CTLZ, VT, Custom); 1526 setOperationAction(ISD::CTPOP, VT, Custom); 1527 setOperationAction(ISD::CTTZ, VT, Custom); 1528 setOperationAction(ISD::FABS, VT, Custom); 1529 setOperationAction(ISD::FADD, VT, Custom); 1530 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1531 setOperationAction(ISD::FCEIL, VT, Custom); 1532 setOperationAction(ISD::FDIV, VT, Custom); 1533 setOperationAction(ISD::FFLOOR, VT, Custom); 1534 setOperationAction(ISD::FMA, VT, Custom); 1535 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1536 setOperationAction(ISD::FMAXNUM, VT, Custom); 1537 setOperationAction(ISD::FMINIMUM, VT, Custom); 1538 setOperationAction(ISD::FMINNUM, VT, Custom); 1539 setOperationAction(ISD::FMUL, VT, Custom); 1540 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1541 setOperationAction(ISD::FNEG, VT, Custom); 1542 setOperationAction(ISD::FP_EXTEND, VT, Custom); 1543 setOperationAction(ISD::FP_ROUND, VT, Custom); 1544 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1545 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1546 setOperationAction(ISD::FRINT, VT, Custom); 1547 setOperationAction(ISD::FROUND, VT, Custom); 1548 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1549 setOperationAction(ISD::FSQRT, VT, Custom); 1550 setOperationAction(ISD::FSUB, VT, Custom); 1551 setOperationAction(ISD::FTRUNC, VT, Custom); 1552 setOperationAction(ISD::LOAD, VT, Custom); 1553 setOperationAction(ISD::MGATHER, VT, Custom); 1554 setOperationAction(ISD::MLOAD, VT, Custom); 1555 setOperationAction(ISD::MSCATTER, VT, Custom); 1556 setOperationAction(ISD::MSTORE, VT, Custom); 1557 setOperationAction(ISD::MUL, VT, Custom); 1558 setOperationAction(ISD::MULHS, VT, Custom); 1559 setOperationAction(ISD::MULHU, VT, Custom); 1560 setOperationAction(ISD::OR, VT, Custom); 1561 setOperationAction(ISD::SDIV, VT, Custom); 1562 setOperationAction(ISD::SELECT, VT, Custom); 1563 setOperationAction(ISD::SETCC, VT, Custom); 1564 setOperationAction(ISD::SHL, VT, Custom); 1565 setOperationAction(ISD::SIGN_EXTEND, VT, Custom); 1566 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); 1567 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1568 setOperationAction(ISD::SMAX, VT, Custom); 1569 setOperationAction(ISD::SMIN, VT, Custom); 1570 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 1571 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1572 setOperationAction(ISD::SRA, VT, Custom); 1573 setOperationAction(ISD::SRL, VT, Custom); 1574 setOperationAction(ISD::STORE, VT, Custom); 1575 setOperationAction(ISD::SUB, VT, Custom); 1576 setOperationAction(ISD::TRUNCATE, VT, Custom); 1577 setOperationAction(ISD::UDIV, VT, Custom); 1578 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1579 setOperationAction(ISD::UMAX, VT, Custom); 1580 setOperationAction(ISD::UMIN, VT, Custom); 1581 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1582 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1583 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1584 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1585 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1586 
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1587 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1588 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1589 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1590 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1591 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1592 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1593 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1594 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1595 setOperationAction(ISD::VSELECT, VT, Custom); 1596 setOperationAction(ISD::XOR, VT, Custom); 1597 setOperationAction(ISD::ZERO_EXTEND, VT, Custom); 1598 } 1599 1600 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 1601 addRegisterClass(VT, &AArch64::FPR64RegClass); 1602 addTypeForNEON(VT); 1603 } 1604 1605 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 1606 addRegisterClass(VT, &AArch64::FPR128RegClass); 1607 addTypeForNEON(VT); 1608 } 1609 1610 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, 1611 LLVMContext &C, EVT VT) const { 1612 if (!VT.isVector()) 1613 return MVT::i32; 1614 if (VT.isScalableVector()) 1615 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); 1616 return VT.changeVectorElementTypeToInteger(); 1617 } 1618 1619 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, 1620 const APInt &Demanded, 1621 TargetLowering::TargetLoweringOpt &TLO, 1622 unsigned NewOpc) { 1623 uint64_t OldImm = Imm, NewImm, Enc; 1624 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; 1625 1626 // Return if the immediate is already all zeros, all ones, a bimm32 or a 1627 // bimm64. 1628 if (Imm == 0 || Imm == Mask || 1629 AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) 1630 return false; 1631 1632 unsigned EltSize = Size; 1633 uint64_t DemandedBits = Demanded.getZExtValue(); 1634 1635 // Clear bits that are not demanded. 1636 Imm &= DemandedBits; 1637 1638 while (true) { 1639 // The goal here is to set the non-demanded bits in a way that minimizes 1640 // the number of switching between 0 and 1. In order to achieve this goal, 1641 // we set the non-demanded bits to the value of the preceding demanded bits. 1642 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a 1643 // non-demanded bit), we copy bit0 (1) to the least significant 'x', 1644 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. 1645 // The final result is 0b11000011. 1646 uint64_t NonDemandedBits = ~DemandedBits; 1647 uint64_t InvertedImm = ~Imm & DemandedBits; 1648 uint64_t RotatedImm = 1649 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & 1650 NonDemandedBits; 1651 uint64_t Sum = RotatedImm + NonDemandedBits; 1652 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); 1653 uint64_t Ones = (Sum + Carry) & NonDemandedBits; 1654 NewImm = (Imm | Ones) & Mask; 1655 1656 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate 1657 // or all-ones or all-zeros, in which case we can stop searching. Otherwise, 1658 // we halve the element size and continue the search. 1659 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) 1660 break; 1661 1662 // We cannot shrink the element size any further if it is 2-bits. 1663 if (EltSize == 2) 1664 return false; 1665 1666 EltSize /= 2; 1667 Mask >>= EltSize; 1668 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; 1669 1670 // Return if there is mismatch in any of the demanded bits of Imm and Hi. 
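    // (The two halves can only be folded into a single narrower element if
    // they agree on every bit that is demanded in both of them.)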
1671 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) 1672 return false; 1673 1674 // Merge the upper and lower halves of Imm and DemandedBits. 1675 Imm |= Hi; 1676 DemandedBits |= DemandedBitsHi; 1677 } 1678 1679 ++NumOptimizedImms; 1680 1681 // Replicate the element across the register width. 1682 while (EltSize < Size) { 1683 NewImm |= NewImm << EltSize; 1684 EltSize *= 2; 1685 } 1686 1687 (void)OldImm; 1688 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && 1689 "demanded bits should never be altered"); 1690 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); 1691 1692 // Create the new constant immediate node. 1693 EVT VT = Op.getValueType(); 1694 SDLoc DL(Op); 1695 SDValue New; 1696 1697 // If the new constant immediate is all-zeros or all-ones, let the target 1698 // independent DAG combine optimize this node. 1699 if (NewImm == 0 || NewImm == OrigMask) { 1700 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 1701 TLO.DAG.getConstant(NewImm, DL, VT)); 1702 // Otherwise, create a machine node so that target independent DAG combine 1703 // doesn't undo this optimization. 1704 } else { 1705 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); 1706 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); 1707 New = SDValue( 1708 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); 1709 } 1710 1711 return TLO.CombineTo(Op, New); 1712 } 1713 1714 bool AArch64TargetLowering::targetShrinkDemandedConstant( 1715 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 1716 TargetLoweringOpt &TLO) const { 1717 // Delay this optimization to as late as possible. 1718 if (!TLO.LegalOps) 1719 return false; 1720 1721 if (!EnableOptimizeLogicalImm) 1722 return false; 1723 1724 EVT VT = Op.getValueType(); 1725 if (VT.isVector()) 1726 return false; 1727 1728 unsigned Size = VT.getSizeInBits(); 1729 assert((Size == 32 || Size == 64) && 1730 "i32 or i64 is expected after legalization."); 1731 1732 // Exit early if we demand all bits. 1733 if (DemandedBits.countPopulation() == Size) 1734 return false; 1735 1736 unsigned NewOpc; 1737 switch (Op.getOpcode()) { 1738 default: 1739 return false; 1740 case ISD::AND: 1741 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; 1742 break; 1743 case ISD::OR: 1744 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; 1745 break; 1746 case ISD::XOR: 1747 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; 1748 break; 1749 } 1750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 1751 if (!C) 1752 return false; 1753 uint64_t Imm = C->getZExtValue(); 1754 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); 1755 } 1756 1757 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 1758 /// Mask are known to be either zero or one and return them Known. 
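/// For example, for AArch64ISD::CSEL the result's known bits are the
/// intersection of the known bits of the two values it may select.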
1759 void AArch64TargetLowering::computeKnownBitsForTargetNode( 1760 const SDValue Op, KnownBits &Known, 1761 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { 1762 switch (Op.getOpcode()) { 1763 default: 1764 break; 1765 case AArch64ISD::CSEL: { 1766 KnownBits Known2; 1767 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 1768 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 1769 Known = KnownBits::commonBits(Known, Known2); 1770 break; 1771 } 1772 case AArch64ISD::LOADgot: 1773 case AArch64ISD::ADDlow: { 1774 if (!Subtarget->isTargetILP32()) 1775 break; 1776 // In ILP32 mode all valid pointers are in the low 4GB of the address-space. 1777 Known.Zero = APInt::getHighBitsSet(64, 32); 1778 break; 1779 } 1780 case ISD::INTRINSIC_W_CHAIN: { 1781 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 1782 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 1783 switch (IntID) { 1784 default: return; 1785 case Intrinsic::aarch64_ldaxr: 1786 case Intrinsic::aarch64_ldxr: { 1787 unsigned BitWidth = Known.getBitWidth(); 1788 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 1789 unsigned MemBits = VT.getScalarSizeInBits(); 1790 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 1791 return; 1792 } 1793 } 1794 break; 1795 } 1796 case ISD::INTRINSIC_WO_CHAIN: 1797 case ISD::INTRINSIC_VOID: { 1798 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 1799 switch (IntNo) { 1800 default: 1801 break; 1802 case Intrinsic::aarch64_neon_umaxv: 1803 case Intrinsic::aarch64_neon_uminv: { 1804 // Figure out the datatype of the vector operand. The UMINV instruction 1805 // will zero extend the result, so we can mark as known zero all the 1806 // bits larger than the element datatype. 32-bit or larget doesn't need 1807 // this as those are legal types and will be handled by isel directly. 1808 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 1809 unsigned BitWidth = Known.getBitWidth(); 1810 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 1811 assert(BitWidth >= 8 && "Unexpected width!"); 1812 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 1813 Known.Zero |= Mask; 1814 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 1815 assert(BitWidth >= 16 && "Unexpected width!"); 1816 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 1817 Known.Zero |= Mask; 1818 } 1819 break; 1820 } break; 1821 } 1822 } 1823 } 1824 } 1825 1826 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 1827 EVT) const { 1828 return MVT::i64; 1829 } 1830 1831 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 1832 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1833 bool *Fast) const { 1834 if (Subtarget->requiresStrictAlign()) 1835 return false; 1836 1837 if (Fast) { 1838 // Some CPUs are fine with unaligned stores except for 128-bit ones. 1839 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 1840 // See comments in performSTORECombine() for more details about 1841 // these conditions. 1842 1843 // Code that uses clang vector extensions can mark that it 1844 // wants unaligned accesses to be treated as fast by 1845 // underspecifying alignment to be 1 or 2. 1846 Alignment <= 2 || 1847 1848 // Disregard v2i64. Memcpy lowering produces those and splitting 1849 // them regresses performance on micro-benchmarks and olden/bh. 
1850 VT == MVT::v2i64; 1851 } 1852 return true; 1853 } 1854 1855 // Same as above but handling LLTs instead. 1856 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 1857 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1858 bool *Fast) const { 1859 if (Subtarget->requiresStrictAlign()) 1860 return false; 1861 1862 if (Fast) { 1863 // Some CPUs are fine with unaligned stores except for 128-bit ones. 1864 *Fast = !Subtarget->isMisaligned128StoreSlow() || 1865 Ty.getSizeInBytes() != 16 || 1866 // See comments in performSTORECombine() for more details about 1867 // these conditions. 1868 1869 // Code that uses clang vector extensions can mark that it 1870 // wants unaligned accesses to be treated as fast by 1871 // underspecifying alignment to be 1 or 2. 1872 Alignment <= 2 || 1873 1874 // Disregard v2i64. Memcpy lowering produces those and splitting 1875 // them regresses performance on micro-benchmarks and olden/bh. 1876 Ty == LLT::fixed_vector(2, 64); 1877 } 1878 return true; 1879 } 1880 1881 FastISel * 1882 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1883 const TargetLibraryInfo *libInfo) const { 1884 return AArch64::createFastISel(funcInfo, libInfo); 1885 } 1886 1887 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 1888 #define MAKE_CASE(V) \ 1889 case V: \ 1890 return #V; 1891 switch ((AArch64ISD::NodeType)Opcode) { 1892 case AArch64ISD::FIRST_NUMBER: 1893 break; 1894 MAKE_CASE(AArch64ISD::CALL) 1895 MAKE_CASE(AArch64ISD::ADRP) 1896 MAKE_CASE(AArch64ISD::ADR) 1897 MAKE_CASE(AArch64ISD::ADDlow) 1898 MAKE_CASE(AArch64ISD::LOADgot) 1899 MAKE_CASE(AArch64ISD::RET_FLAG) 1900 MAKE_CASE(AArch64ISD::BRCOND) 1901 MAKE_CASE(AArch64ISD::CSEL) 1902 MAKE_CASE(AArch64ISD::CSINV) 1903 MAKE_CASE(AArch64ISD::CSNEG) 1904 MAKE_CASE(AArch64ISD::CSINC) 1905 MAKE_CASE(AArch64ISD::THREAD_POINTER) 1906 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) 1907 MAKE_CASE(AArch64ISD::ADD_PRED) 1908 MAKE_CASE(AArch64ISD::MUL_PRED) 1909 MAKE_CASE(AArch64ISD::MULHS_PRED) 1910 MAKE_CASE(AArch64ISD::MULHU_PRED) 1911 MAKE_CASE(AArch64ISD::SDIV_PRED) 1912 MAKE_CASE(AArch64ISD::SHL_PRED) 1913 MAKE_CASE(AArch64ISD::SMAX_PRED) 1914 MAKE_CASE(AArch64ISD::SMIN_PRED) 1915 MAKE_CASE(AArch64ISD::SRA_PRED) 1916 MAKE_CASE(AArch64ISD::SRL_PRED) 1917 MAKE_CASE(AArch64ISD::SUB_PRED) 1918 MAKE_CASE(AArch64ISD::UDIV_PRED) 1919 MAKE_CASE(AArch64ISD::UMAX_PRED) 1920 MAKE_CASE(AArch64ISD::UMIN_PRED) 1921 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) 1922 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) 1923 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) 1924 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) 1925 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) 1926 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) 1927 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) 1928 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) 1929 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) 1930 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) 1931 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) 1932 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) 1933 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) 1934 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) 1935 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) 1936 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) 1937 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) 1938 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) 1939 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) 1940 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) 1941 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) 
1942 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) 1943 MAKE_CASE(AArch64ISD::ADC) 1944 MAKE_CASE(AArch64ISD::SBC) 1945 MAKE_CASE(AArch64ISD::ADDS) 1946 MAKE_CASE(AArch64ISD::SUBS) 1947 MAKE_CASE(AArch64ISD::ADCS) 1948 MAKE_CASE(AArch64ISD::SBCS) 1949 MAKE_CASE(AArch64ISD::ANDS) 1950 MAKE_CASE(AArch64ISD::CCMP) 1951 MAKE_CASE(AArch64ISD::CCMN) 1952 MAKE_CASE(AArch64ISD::FCCMP) 1953 MAKE_CASE(AArch64ISD::FCMP) 1954 MAKE_CASE(AArch64ISD::STRICT_FCMP) 1955 MAKE_CASE(AArch64ISD::STRICT_FCMPE) 1956 MAKE_CASE(AArch64ISD::DUP) 1957 MAKE_CASE(AArch64ISD::DUPLANE8) 1958 MAKE_CASE(AArch64ISD::DUPLANE16) 1959 MAKE_CASE(AArch64ISD::DUPLANE32) 1960 MAKE_CASE(AArch64ISD::DUPLANE64) 1961 MAKE_CASE(AArch64ISD::MOVI) 1962 MAKE_CASE(AArch64ISD::MOVIshift) 1963 MAKE_CASE(AArch64ISD::MOVIedit) 1964 MAKE_CASE(AArch64ISD::MOVImsl) 1965 MAKE_CASE(AArch64ISD::FMOV) 1966 MAKE_CASE(AArch64ISD::MVNIshift) 1967 MAKE_CASE(AArch64ISD::MVNImsl) 1968 MAKE_CASE(AArch64ISD::BICi) 1969 MAKE_CASE(AArch64ISD::ORRi) 1970 MAKE_CASE(AArch64ISD::BSP) 1971 MAKE_CASE(AArch64ISD::EXTR) 1972 MAKE_CASE(AArch64ISD::ZIP1) 1973 MAKE_CASE(AArch64ISD::ZIP2) 1974 MAKE_CASE(AArch64ISD::UZP1) 1975 MAKE_CASE(AArch64ISD::UZP2) 1976 MAKE_CASE(AArch64ISD::TRN1) 1977 MAKE_CASE(AArch64ISD::TRN2) 1978 MAKE_CASE(AArch64ISD::REV16) 1979 MAKE_CASE(AArch64ISD::REV32) 1980 MAKE_CASE(AArch64ISD::REV64) 1981 MAKE_CASE(AArch64ISD::EXT) 1982 MAKE_CASE(AArch64ISD::SPLICE) 1983 MAKE_CASE(AArch64ISD::VSHL) 1984 MAKE_CASE(AArch64ISD::VLSHR) 1985 MAKE_CASE(AArch64ISD::VASHR) 1986 MAKE_CASE(AArch64ISD::VSLI) 1987 MAKE_CASE(AArch64ISD::VSRI) 1988 MAKE_CASE(AArch64ISD::CMEQ) 1989 MAKE_CASE(AArch64ISD::CMGE) 1990 MAKE_CASE(AArch64ISD::CMGT) 1991 MAKE_CASE(AArch64ISD::CMHI) 1992 MAKE_CASE(AArch64ISD::CMHS) 1993 MAKE_CASE(AArch64ISD::FCMEQ) 1994 MAKE_CASE(AArch64ISD::FCMGE) 1995 MAKE_CASE(AArch64ISD::FCMGT) 1996 MAKE_CASE(AArch64ISD::CMEQz) 1997 MAKE_CASE(AArch64ISD::CMGEz) 1998 MAKE_CASE(AArch64ISD::CMGTz) 1999 MAKE_CASE(AArch64ISD::CMLEz) 2000 MAKE_CASE(AArch64ISD::CMLTz) 2001 MAKE_CASE(AArch64ISD::FCMEQz) 2002 MAKE_CASE(AArch64ISD::FCMGEz) 2003 MAKE_CASE(AArch64ISD::FCMGTz) 2004 MAKE_CASE(AArch64ISD::FCMLEz) 2005 MAKE_CASE(AArch64ISD::FCMLTz) 2006 MAKE_CASE(AArch64ISD::SADDV) 2007 MAKE_CASE(AArch64ISD::UADDV) 2008 MAKE_CASE(AArch64ISD::SRHADD) 2009 MAKE_CASE(AArch64ISD::URHADD) 2010 MAKE_CASE(AArch64ISD::SHADD) 2011 MAKE_CASE(AArch64ISD::UHADD) 2012 MAKE_CASE(AArch64ISD::SDOT) 2013 MAKE_CASE(AArch64ISD::UDOT) 2014 MAKE_CASE(AArch64ISD::SMINV) 2015 MAKE_CASE(AArch64ISD::UMINV) 2016 MAKE_CASE(AArch64ISD::SMAXV) 2017 MAKE_CASE(AArch64ISD::UMAXV) 2018 MAKE_CASE(AArch64ISD::SADDV_PRED) 2019 MAKE_CASE(AArch64ISD::UADDV_PRED) 2020 MAKE_CASE(AArch64ISD::SMAXV_PRED) 2021 MAKE_CASE(AArch64ISD::UMAXV_PRED) 2022 MAKE_CASE(AArch64ISD::SMINV_PRED) 2023 MAKE_CASE(AArch64ISD::UMINV_PRED) 2024 MAKE_CASE(AArch64ISD::ORV_PRED) 2025 MAKE_CASE(AArch64ISD::EORV_PRED) 2026 MAKE_CASE(AArch64ISD::ANDV_PRED) 2027 MAKE_CASE(AArch64ISD::CLASTA_N) 2028 MAKE_CASE(AArch64ISD::CLASTB_N) 2029 MAKE_CASE(AArch64ISD::LASTA) 2030 MAKE_CASE(AArch64ISD::LASTB) 2031 MAKE_CASE(AArch64ISD::REINTERPRET_CAST) 2032 MAKE_CASE(AArch64ISD::LS64_BUILD) 2033 MAKE_CASE(AArch64ISD::LS64_EXTRACT) 2034 MAKE_CASE(AArch64ISD::TBL) 2035 MAKE_CASE(AArch64ISD::FADD_PRED) 2036 MAKE_CASE(AArch64ISD::FADDA_PRED) 2037 MAKE_CASE(AArch64ISD::FADDV_PRED) 2038 MAKE_CASE(AArch64ISD::FDIV_PRED) 2039 MAKE_CASE(AArch64ISD::FMA_PRED) 2040 MAKE_CASE(AArch64ISD::FMAX_PRED) 2041 MAKE_CASE(AArch64ISD::FMAXV_PRED) 2042 
MAKE_CASE(AArch64ISD::FMAXNM_PRED) 2043 MAKE_CASE(AArch64ISD::FMAXNMV_PRED) 2044 MAKE_CASE(AArch64ISD::FMIN_PRED) 2045 MAKE_CASE(AArch64ISD::FMINV_PRED) 2046 MAKE_CASE(AArch64ISD::FMINNM_PRED) 2047 MAKE_CASE(AArch64ISD::FMINNMV_PRED) 2048 MAKE_CASE(AArch64ISD::FMUL_PRED) 2049 MAKE_CASE(AArch64ISD::FSUB_PRED) 2050 MAKE_CASE(AArch64ISD::BIC) 2051 MAKE_CASE(AArch64ISD::BIT) 2052 MAKE_CASE(AArch64ISD::CBZ) 2053 MAKE_CASE(AArch64ISD::CBNZ) 2054 MAKE_CASE(AArch64ISD::TBZ) 2055 MAKE_CASE(AArch64ISD::TBNZ) 2056 MAKE_CASE(AArch64ISD::TC_RETURN) 2057 MAKE_CASE(AArch64ISD::PREFETCH) 2058 MAKE_CASE(AArch64ISD::SITOF) 2059 MAKE_CASE(AArch64ISD::UITOF) 2060 MAKE_CASE(AArch64ISD::NVCAST) 2061 MAKE_CASE(AArch64ISD::MRS) 2062 MAKE_CASE(AArch64ISD::SQSHL_I) 2063 MAKE_CASE(AArch64ISD::UQSHL_I) 2064 MAKE_CASE(AArch64ISD::SRSHR_I) 2065 MAKE_CASE(AArch64ISD::URSHR_I) 2066 MAKE_CASE(AArch64ISD::SQSHLU_I) 2067 MAKE_CASE(AArch64ISD::WrapperLarge) 2068 MAKE_CASE(AArch64ISD::LD2post) 2069 MAKE_CASE(AArch64ISD::LD3post) 2070 MAKE_CASE(AArch64ISD::LD4post) 2071 MAKE_CASE(AArch64ISD::ST2post) 2072 MAKE_CASE(AArch64ISD::ST3post) 2073 MAKE_CASE(AArch64ISD::ST4post) 2074 MAKE_CASE(AArch64ISD::LD1x2post) 2075 MAKE_CASE(AArch64ISD::LD1x3post) 2076 MAKE_CASE(AArch64ISD::LD1x4post) 2077 MAKE_CASE(AArch64ISD::ST1x2post) 2078 MAKE_CASE(AArch64ISD::ST1x3post) 2079 MAKE_CASE(AArch64ISD::ST1x4post) 2080 MAKE_CASE(AArch64ISD::LD1DUPpost) 2081 MAKE_CASE(AArch64ISD::LD2DUPpost) 2082 MAKE_CASE(AArch64ISD::LD3DUPpost) 2083 MAKE_CASE(AArch64ISD::LD4DUPpost) 2084 MAKE_CASE(AArch64ISD::LD1LANEpost) 2085 MAKE_CASE(AArch64ISD::LD2LANEpost) 2086 MAKE_CASE(AArch64ISD::LD3LANEpost) 2087 MAKE_CASE(AArch64ISD::LD4LANEpost) 2088 MAKE_CASE(AArch64ISD::ST2LANEpost) 2089 MAKE_CASE(AArch64ISD::ST3LANEpost) 2090 MAKE_CASE(AArch64ISD::ST4LANEpost) 2091 MAKE_CASE(AArch64ISD::SMULL) 2092 MAKE_CASE(AArch64ISD::UMULL) 2093 MAKE_CASE(AArch64ISD::FRECPE) 2094 MAKE_CASE(AArch64ISD::FRECPS) 2095 MAKE_CASE(AArch64ISD::FRSQRTE) 2096 MAKE_CASE(AArch64ISD::FRSQRTS) 2097 MAKE_CASE(AArch64ISD::STG) 2098 MAKE_CASE(AArch64ISD::STZG) 2099 MAKE_CASE(AArch64ISD::ST2G) 2100 MAKE_CASE(AArch64ISD::STZ2G) 2101 MAKE_CASE(AArch64ISD::SUNPKHI) 2102 MAKE_CASE(AArch64ISD::SUNPKLO) 2103 MAKE_CASE(AArch64ISD::UUNPKHI) 2104 MAKE_CASE(AArch64ISD::UUNPKLO) 2105 MAKE_CASE(AArch64ISD::INSR) 2106 MAKE_CASE(AArch64ISD::PTEST) 2107 MAKE_CASE(AArch64ISD::PTRUE) 2108 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) 2109 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) 2110 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) 2111 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) 2112 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) 2113 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) 2114 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) 2115 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) 2116 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) 2117 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) 2118 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) 2119 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) 2120 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) 2121 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) 2122 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) 2123 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) 2124 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) 2125 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) 2126 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) 2127 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) 2128 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) 2129 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) 2130 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) 2131 
MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) 2132 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) 2133 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) 2134 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) 2135 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) 2136 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) 2137 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) 2138 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) 2139 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) 2140 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) 2141 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) 2142 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) 2143 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) 2144 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) 2145 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) 2146 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) 2147 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) 2148 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) 2149 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) 2150 MAKE_CASE(AArch64ISD::ST1_PRED) 2151 MAKE_CASE(AArch64ISD::SST1_PRED) 2152 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) 2153 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) 2154 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) 2155 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) 2156 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) 2157 MAKE_CASE(AArch64ISD::SST1_IMM_PRED) 2158 MAKE_CASE(AArch64ISD::SSTNT1_PRED) 2159 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) 2160 MAKE_CASE(AArch64ISD::LDP) 2161 MAKE_CASE(AArch64ISD::STP) 2162 MAKE_CASE(AArch64ISD::STNP) 2163 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) 2164 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) 2165 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) 2166 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) 2167 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) 2168 MAKE_CASE(AArch64ISD::INDEX_VECTOR) 2169 MAKE_CASE(AArch64ISD::UADDLP) 2170 MAKE_CASE(AArch64ISD::CALL_RVMARKER) 2171 } 2172 #undef MAKE_CASE 2173 return nullptr; 2174 } 2175 2176 MachineBasicBlock * 2177 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 2178 MachineBasicBlock *MBB) const { 2179 // We materialise the F128CSEL pseudo-instruction as some control flow and a 2180 // phi node: 2181 2182 // OrigBB: 2183 // [... previous instrs leading to comparison ...] 
2184 // b.ne TrueBB 2185 // b EndBB 2186 // TrueBB: 2187 // ; Fallthrough 2188 // EndBB: 2189 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 2190 2191 MachineFunction *MF = MBB->getParent(); 2192 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2193 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 2194 DebugLoc DL = MI.getDebugLoc(); 2195 MachineFunction::iterator It = ++MBB->getIterator(); 2196 2197 Register DestReg = MI.getOperand(0).getReg(); 2198 Register IfTrueReg = MI.getOperand(1).getReg(); 2199 Register IfFalseReg = MI.getOperand(2).getReg(); 2200 unsigned CondCode = MI.getOperand(3).getImm(); 2201 bool NZCVKilled = MI.getOperand(4).isKill(); 2202 2203 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 2204 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 2205 MF->insert(It, TrueBB); 2206 MF->insert(It, EndBB); 2207 2208 // Transfer rest of current basic-block to EndBB 2209 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 2210 MBB->end()); 2211 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 2212 2213 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 2214 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 2215 MBB->addSuccessor(TrueBB); 2216 MBB->addSuccessor(EndBB); 2217 2218 // TrueBB falls through to the end. 2219 TrueBB->addSuccessor(EndBB); 2220 2221 if (!NZCVKilled) { 2222 TrueBB->addLiveIn(AArch64::NZCV); 2223 EndBB->addLiveIn(AArch64::NZCV); 2224 } 2225 2226 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 2227 .addReg(IfTrueReg) 2228 .addMBB(TrueBB) 2229 .addReg(IfFalseReg) 2230 .addMBB(MBB); 2231 2232 MI.eraseFromParent(); 2233 return EndBB; 2234 } 2235 2236 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( 2237 MachineInstr &MI, MachineBasicBlock *BB) const { 2238 assert(!isAsynchronousEHPersonality(classifyEHPersonality( 2239 BB->getParent()->getFunction().getPersonalityFn())) && 2240 "SEH does not use catchret!"); 2241 return BB; 2242 } 2243 2244 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 2245 MachineInstr &MI, MachineBasicBlock *BB) const { 2246 switch (MI.getOpcode()) { 2247 default: 2248 #ifndef NDEBUG 2249 MI.dump(); 2250 #endif 2251 llvm_unreachable("Unexpected instruction for custom inserter!"); 2252 2253 case AArch64::F128CSEL: 2254 return EmitF128CSEL(MI, BB); 2255 2256 case TargetOpcode::STACKMAP: 2257 case TargetOpcode::PATCHPOINT: 2258 case TargetOpcode::STATEPOINT: 2259 return emitPatchPoint(MI, BB); 2260 2261 case AArch64::CATCHRET: 2262 return EmitLoweredCatchRet(MI, BB); 2263 } 2264 } 2265 2266 //===----------------------------------------------------------------------===// 2267 // AArch64 Lowering private implementation. 2268 //===----------------------------------------------------------------------===// 2269 2270 //===----------------------------------------------------------------------===// 2271 // Lowering Code 2272 //===----------------------------------------------------------------------===// 2273 2274 // Forward declarations of SVE fixed length lowering helpers 2275 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); 2276 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); 2277 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); 2278 static SDValue convertFixedMaskToScalableVector(SDValue Mask, 2279 SelectionDAG &DAG); 2280 2281 /// isZerosVector - Check whether SDNode N is a zero-filled vector. 
2282 static bool isZerosVector(const SDNode *N) { 2283 // Look through a bit convert. 2284 while (N->getOpcode() == ISD::BITCAST) 2285 N = N->getOperand(0).getNode(); 2286 2287 if (ISD::isConstantSplatVectorAllZeros(N)) 2288 return true; 2289 2290 if (N->getOpcode() != AArch64ISD::DUP) 2291 return false; 2292 2293 auto Opnd0 = N->getOperand(0); 2294 auto *CINT = dyn_cast<ConstantSDNode>(Opnd0); 2295 auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0); 2296 return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero()); 2297 } 2298 2299 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 2300 /// CC 2301 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 2302 switch (CC) { 2303 default: 2304 llvm_unreachable("Unknown condition code!"); 2305 case ISD::SETNE: 2306 return AArch64CC::NE; 2307 case ISD::SETEQ: 2308 return AArch64CC::EQ; 2309 case ISD::SETGT: 2310 return AArch64CC::GT; 2311 case ISD::SETGE: 2312 return AArch64CC::GE; 2313 case ISD::SETLT: 2314 return AArch64CC::LT; 2315 case ISD::SETLE: 2316 return AArch64CC::LE; 2317 case ISD::SETUGT: 2318 return AArch64CC::HI; 2319 case ISD::SETUGE: 2320 return AArch64CC::HS; 2321 case ISD::SETULT: 2322 return AArch64CC::LO; 2323 case ISD::SETULE: 2324 return AArch64CC::LS; 2325 } 2326 } 2327 2328 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 2329 static void changeFPCCToAArch64CC(ISD::CondCode CC, 2330 AArch64CC::CondCode &CondCode, 2331 AArch64CC::CondCode &CondCode2) { 2332 CondCode2 = AArch64CC::AL; 2333 switch (CC) { 2334 default: 2335 llvm_unreachable("Unknown FP condition!"); 2336 case ISD::SETEQ: 2337 case ISD::SETOEQ: 2338 CondCode = AArch64CC::EQ; 2339 break; 2340 case ISD::SETGT: 2341 case ISD::SETOGT: 2342 CondCode = AArch64CC::GT; 2343 break; 2344 case ISD::SETGE: 2345 case ISD::SETOGE: 2346 CondCode = AArch64CC::GE; 2347 break; 2348 case ISD::SETOLT: 2349 CondCode = AArch64CC::MI; 2350 break; 2351 case ISD::SETOLE: 2352 CondCode = AArch64CC::LS; 2353 break; 2354 case ISD::SETONE: 2355 CondCode = AArch64CC::MI; 2356 CondCode2 = AArch64CC::GT; 2357 break; 2358 case ISD::SETO: 2359 CondCode = AArch64CC::VC; 2360 break; 2361 case ISD::SETUO: 2362 CondCode = AArch64CC::VS; 2363 break; 2364 case ISD::SETUEQ: 2365 CondCode = AArch64CC::EQ; 2366 CondCode2 = AArch64CC::VS; 2367 break; 2368 case ISD::SETUGT: 2369 CondCode = AArch64CC::HI; 2370 break; 2371 case ISD::SETUGE: 2372 CondCode = AArch64CC::PL; 2373 break; 2374 case ISD::SETLT: 2375 case ISD::SETULT: 2376 CondCode = AArch64CC::LT; 2377 break; 2378 case ISD::SETLE: 2379 case ISD::SETULE: 2380 CondCode = AArch64CC::LE; 2381 break; 2382 case ISD::SETNE: 2383 case ISD::SETUNE: 2384 CondCode = AArch64CC::NE; 2385 break; 2386 } 2387 } 2388 2389 /// Convert a DAG fp condition code to an AArch64 CC. 2390 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 2391 /// should be AND'ed instead of OR'ed. 
2392 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 2393 AArch64CC::CondCode &CondCode, 2394 AArch64CC::CondCode &CondCode2) { 2395 CondCode2 = AArch64CC::AL; 2396 switch (CC) { 2397 default: 2398 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 2399 assert(CondCode2 == AArch64CC::AL); 2400 break; 2401 case ISD::SETONE: 2402 // (a one b) 2403 // == ((a olt b) || (a ogt b)) 2404 // == ((a ord b) && (a une b)) 2405 CondCode = AArch64CC::VC; 2406 CondCode2 = AArch64CC::NE; 2407 break; 2408 case ISD::SETUEQ: 2409 // (a ueq b) 2410 // == ((a uno b) || (a oeq b)) 2411 // == ((a ule b) && (a uge b)) 2412 CondCode = AArch64CC::PL; 2413 CondCode2 = AArch64CC::LE; 2414 break; 2415 } 2416 } 2417 2418 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 2419 /// CC usable with the vector instructions. Fewer operations are available 2420 /// without a real NZCV register, so we have to use less efficient combinations 2421 /// to get the same effect. 2422 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 2423 AArch64CC::CondCode &CondCode, 2424 AArch64CC::CondCode &CondCode2, 2425 bool &Invert) { 2426 Invert = false; 2427 switch (CC) { 2428 default: 2429 // Mostly the scalar mappings work fine. 2430 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 2431 break; 2432 case ISD::SETUO: 2433 Invert = true; 2434 LLVM_FALLTHROUGH; 2435 case ISD::SETO: 2436 CondCode = AArch64CC::MI; 2437 CondCode2 = AArch64CC::GE; 2438 break; 2439 case ISD::SETUEQ: 2440 case ISD::SETULT: 2441 case ISD::SETULE: 2442 case ISD::SETUGT: 2443 case ISD::SETUGE: 2444 // All of the compare-mask comparisons are ordered, but we can switch 2445 // between the two by a double inversion. E.g. ULE == !OGT. 2446 Invert = true; 2447 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), 2448 CondCode, CondCode2); 2449 break; 2450 } 2451 } 2452 2453 static bool isLegalArithImmed(uint64_t C) { 2454 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 2455 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 2456 LLVM_DEBUG(dbgs() << "Is imm " << C 2457 << " legal: " << (IsLegal ? "yes\n" : "no\n")); 2458 return IsLegal; 2459 } 2460 2461 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on 2462 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags 2463 // can be set differently by this operation. It comes down to whether 2464 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 2465 // everything is fine. If not then the optimization is wrong. Thus general 2466 // comparisons are only valid if op2 != 0. 2467 // 2468 // So, finally, the only LLVM-native comparisons that don't mention C and V 2469 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 2470 // the absence of information about op2. 2471 static bool isCMN(SDValue Op, ISD::CondCode CC) { 2472 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && 2473 (CC == ISD::SETEQ || CC == ISD::SETNE); 2474 } 2475 2476 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, 2477 SelectionDAG &DAG, SDValue Chain, 2478 bool IsSignaling) { 2479 EVT VT = LHS.getValueType(); 2480 assert(VT != MVT::f128); 2481 assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); 2482 unsigned Opcode = 2483 IsSignaling ? 
                            AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}

static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();
  const bool FullFP16 =
      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    if (VT == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
    if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for
      // one of the signed comparisons.
      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
                                           DAG.getVTList(VT, MVT_CC),
                                           LHS.getOperand(0),
                                           LHS.getOperand(1));
      // Replace all users of (and X, Y) with newly generated (ands X, Y)
      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
      return ANDSNode.getValue(1);
    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
      // Use result of ANDS
      return LHS.getValue(1);
    }
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}

/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
///   - We can implement (NEG SETCC), i.e. negating a single comparison, by
///     negating the flags used in a CCMP/FCCMP operation.
///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///     by negating the flags we test for afterwards; i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
///   - Note that we can only ever negate all previously processed results.
2561 /// What we can not implement by flipping the flags to test is a negation 2562 /// of two sub-trees (because the negation affects all sub-trees emitted so 2563 /// far, so the 2nd sub-tree we emit would also affect the first). 2564 /// With those tools we can implement some OR operations: 2565 /// - (OR (SETCC A) (SETCC B)) can be implemented via: 2566 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) 2567 /// - After transforming OR to NEG/AND combinations we may be able to use NEG 2568 /// elimination rules from earlier to implement the whole thing as a 2569 /// CCMP/FCCMP chain. 2570 /// 2571 /// As complete example: 2572 /// or (or (setCA (cmp A)) (setCB (cmp B))) 2573 /// (and (setCC (cmp C)) (setCD (cmp D)))" 2574 /// can be reassociated to: 2575 /// or (and (setCC (cmp C)) setCD (cmp D)) 2576 // (or (setCA (cmp A)) (setCB (cmp B))) 2577 /// can be transformed to: 2578 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) 2579 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" 2580 /// which can be implemented as: 2581 /// cmp C 2582 /// ccmp D, inv(CD), CC 2583 /// ccmp A, CA, inv(CD) 2584 /// ccmp B, CB, inv(CA) 2585 /// check for CB flags 2586 /// 2587 /// A counterexample is "or (and A B) (and C D)" which translates to 2588 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we 2589 /// can only implement 1 of the inner (not) operations, but not both! 2590 /// @{ 2591 2592 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. 2593 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, 2594 ISD::CondCode CC, SDValue CCOp, 2595 AArch64CC::CondCode Predicate, 2596 AArch64CC::CondCode OutCC, 2597 const SDLoc &DL, SelectionDAG &DAG) { 2598 unsigned Opcode = 0; 2599 const bool FullFP16 = 2600 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 2601 2602 if (LHS.getValueType().isFloatingPoint()) { 2603 assert(LHS.getValueType() != MVT::f128); 2604 if (LHS.getValueType() == MVT::f16 && !FullFP16) { 2605 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); 2606 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); 2607 } 2608 Opcode = AArch64ISD::FCCMP; 2609 } else if (RHS.getOpcode() == ISD::SUB) { 2610 SDValue SubOp0 = RHS.getOperand(0); 2611 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 2612 // See emitComparison() on why we can only do this for SETEQ and SETNE. 2613 Opcode = AArch64ISD::CCMN; 2614 RHS = RHS.getOperand(1); 2615 } 2616 } 2617 if (Opcode == 0) 2618 Opcode = AArch64ISD::CCMP; 2619 2620 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); 2621 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 2622 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 2623 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); 2624 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); 2625 } 2626 2627 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be 2628 /// expressed as a conjunction. See \ref AArch64CCMP. 2629 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 2630 /// changing the conditions on the SETCC tests. 2631 /// (this means we can call emitConjunctionRec() with 2632 /// Negate==true on this sub-tree) 2633 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 2634 /// cannot do the negation naturally. We are required to 2635 /// emit the subtree first in this case. 
/// \param WillNegate Is true if we are called when the result of this
///                   subexpression must be negated. This happens when the
///                   outer expression is an OR. We can use this fact to know
///                   that we have a double negation (or (or ...) ...) that
///                   can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                               bool &MustBeFirst, bool WillNegate,
                               unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    bool IsOR = Opcode == ISD::OR;
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
      return false;

    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == ISD::AND && "Must be OR or AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}

/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
                                  AArch64CC::CondCode &OutCC, bool Negate,
                                  SDValue CCOp,
                                  AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
2717 if (isInteger) { 2718 OutCC = changeIntCCToAArch64CC(CC); 2719 } else { 2720 assert(LHS.getValueType().isFloatingPoint()); 2721 AArch64CC::CondCode ExtraCC; 2722 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 2723 // Some floating point conditions can't be tested with a single condition 2724 // code. Construct an additional comparison in this case. 2725 if (ExtraCC != AArch64CC::AL) { 2726 SDValue ExtraCmp; 2727 if (!CCOp.getNode()) 2728 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 2729 else 2730 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 2731 ExtraCC, DL, DAG); 2732 CCOp = ExtraCmp; 2733 Predicate = ExtraCC; 2734 } 2735 } 2736 2737 // Produce a normal comparison if we are first in the chain 2738 if (!CCOp) 2739 return emitComparison(LHS, RHS, CC, DL, DAG); 2740 // Otherwise produce a ccmp. 2741 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 2742 DAG); 2743 } 2744 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); 2745 2746 bool IsOR = Opcode == ISD::OR; 2747 2748 SDValue LHS = Val->getOperand(0); 2749 bool CanNegateL; 2750 bool MustBeFirstL; 2751 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); 2752 assert(ValidL && "Valid conjunction/disjunction tree"); 2753 (void)ValidL; 2754 2755 SDValue RHS = Val->getOperand(1); 2756 bool CanNegateR; 2757 bool MustBeFirstR; 2758 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); 2759 assert(ValidR && "Valid conjunction/disjunction tree"); 2760 (void)ValidR; 2761 2762 // Swap sub-tree that must come first to the right side. 2763 if (MustBeFirstL) { 2764 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 2765 std::swap(LHS, RHS); 2766 std::swap(CanNegateL, CanNegateR); 2767 std::swap(MustBeFirstL, MustBeFirstR); 2768 } 2769 2770 bool NegateR; 2771 bool NegateAfterR; 2772 bool NegateL; 2773 bool NegateAfterAll; 2774 if (Opcode == ISD::OR) { 2775 // Swap the sub-tree that we can negate naturally to the left. 2776 if (!CanNegateL) { 2777 assert(CanNegateR && "at least one side must be negatable"); 2778 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 2779 assert(!Negate); 2780 std::swap(LHS, RHS); 2781 NegateR = false; 2782 NegateAfterR = true; 2783 } else { 2784 // Negate the left sub-tree if possible, otherwise negate the result. 2785 NegateR = CanNegateR; 2786 NegateAfterR = !CanNegateR; 2787 } 2788 NegateL = true; 2789 NegateAfterAll = !Negate; 2790 } else { 2791 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); 2792 assert(!Negate && "Valid conjunction/disjunction tree"); 2793 2794 NegateL = false; 2795 NegateR = false; 2796 NegateAfterR = false; 2797 NegateAfterAll = false; 2798 } 2799 2800 // Emit sub-trees. 2801 AArch64CC::CondCode RHSCC; 2802 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); 2803 if (NegateAfterR) 2804 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 2805 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); 2806 if (NegateAfterAll) 2807 OutCC = AArch64CC::getInvertedCondCode(OutCC); 2808 return CmpL; 2809 } 2810 2811 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). 2812 /// In some cases this is even possible with OR operations in the expression. 2813 /// See \ref AArch64CCMP. 2814 /// \see emitConjunctionRec(). 
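/// For example, (and (setcc a, b, setlt), (setcc c, d, seteq)) can be emitted
/// as a CMP on one pair of operands followed by a CCMP on the other, leaving a
/// single condition code in OutCC for the caller to test.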
2815 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, 2816 AArch64CC::CondCode &OutCC) { 2817 bool DummyCanNegate; 2818 bool DummyMustBeFirst; 2819 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) 2820 return SDValue(); 2821 2822 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); 2823 } 2824 2825 /// @} 2826 2827 /// Returns how profitable it is to fold a comparison's operand's shift and/or 2828 /// extension operations. 2829 static unsigned getCmpOperandFoldingProfit(SDValue Op) { 2830 auto isSupportedExtend = [&](SDValue V) { 2831 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) 2832 return true; 2833 2834 if (V.getOpcode() == ISD::AND) 2835 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { 2836 uint64_t Mask = MaskCst->getZExtValue(); 2837 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); 2838 } 2839 2840 return false; 2841 }; 2842 2843 if (!Op.hasOneUse()) 2844 return 0; 2845 2846 if (isSupportedExtend(Op)) 2847 return 1; 2848 2849 unsigned Opc = Op.getOpcode(); 2850 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 2851 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 2852 uint64_t Shift = ShiftCst->getZExtValue(); 2853 if (isSupportedExtend(Op.getOperand(0))) 2854 return (Shift <= 4) ? 2 : 1; 2855 EVT VT = Op.getValueType(); 2856 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) 2857 return 1; 2858 } 2859 2860 return 0; 2861 } 2862 2863 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 2864 SDValue &AArch64cc, SelectionDAG &DAG, 2865 const SDLoc &dl) { 2866 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 2867 EVT VT = RHS.getValueType(); 2868 uint64_t C = RHSC->getZExtValue(); 2869 if (!isLegalArithImmed(C)) { 2870 // Constant does not fit, try adjusting it by one? 2871 switch (CC) { 2872 default: 2873 break; 2874 case ISD::SETLT: 2875 case ISD::SETGE: 2876 if ((VT == MVT::i32 && C != 0x80000000 && 2877 isLegalArithImmed((uint32_t)(C - 1))) || 2878 (VT == MVT::i64 && C != 0x80000000ULL && 2879 isLegalArithImmed(C - 1ULL))) { 2880 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 2881 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 2882 RHS = DAG.getConstant(C, dl, VT); 2883 } 2884 break; 2885 case ISD::SETULT: 2886 case ISD::SETUGE: 2887 if ((VT == MVT::i32 && C != 0 && 2888 isLegalArithImmed((uint32_t)(C - 1))) || 2889 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 2890 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 2891 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 2892 RHS = DAG.getConstant(C, dl, VT); 2893 } 2894 break; 2895 case ISD::SETLE: 2896 case ISD::SETGT: 2897 if ((VT == MVT::i32 && C != INT32_MAX && 2898 isLegalArithImmed((uint32_t)(C + 1))) || 2899 (VT == MVT::i64 && C != INT64_MAX && 2900 isLegalArithImmed(C + 1ULL))) { 2901 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 2902 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 2903 RHS = DAG.getConstant(C, dl, VT); 2904 } 2905 break; 2906 case ISD::SETULE: 2907 case ISD::SETUGT: 2908 if ((VT == MVT::i32 && C != UINT32_MAX && 2909 isLegalArithImmed((uint32_t)(C + 1))) || 2910 (VT == MVT::i64 && C != UINT64_MAX && 2911 isLegalArithImmed(C + 1ULL))) { 2912 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 2913 C = (VT == MVT::i32) ? 
(uint32_t)(C + 1) : C + 1;
2914         RHS = DAG.getConstant(C, dl, VT);
2915       }
2916       break;
2917     }
2918   }
2919 }
2920 
2921   // Comparisons are canonicalized so that the RHS operand is simpler than the
2922   // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2923   // can fold some shift+extend operations on the RHS operand, so swap the
2924   // operands if that can be done.
2925   //
2926   // For example:
2927   //    lsl     w13, w11, #1
2928   //    cmp     w13, w12
2929   // can be turned into:
2930   //    cmp     w12, w11, lsl #1
2931   if (!isa<ConstantSDNode>(RHS) ||
2932       !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2933     SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2934 
2935     if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2936       std::swap(LHS, RHS);
2937       CC = ISD::getSetCCSwappedOperands(CC);
2938     }
2939   }
2940 
2941   SDValue Cmp;
2942   AArch64CC::CondCode AArch64CC;
2943   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2944     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2945 
2946     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2947     // For the i8 operand, the largest immediate is 255, so this can be easily
2948     // encoded in the compare instruction. For the i16 operand, however, the
2949     // largest immediate cannot be encoded in the compare.
2950     // Therefore, use a sign extending load and cmn to avoid materializing the
2951     // -1 constant. For example,
2952     // movz w1, #65535
2953     // ldrh w0, [x0, #0]
2954     // cmp w0, w1
2955     // >
2956     // ldrsh w0, [x0, #0]
2957     // cmn w0, #1
2958     // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2959     // if and only if (sext LHS) == (sext RHS). The checks are in place to
2960     // ensure both the LHS and RHS are truly zero extended and to make sure the
2961     // transformation is profitable.
2962 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 2963 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 2964 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 2965 LHS.getNode()->hasNUsesOfValue(1, 0)) { 2966 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 2967 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 2968 SDValue SExt = 2969 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 2970 DAG.getValueType(MVT::i16)); 2971 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 2972 RHS.getValueType()), 2973 CC, dl, DAG); 2974 AArch64CC = changeIntCCToAArch64CC(CC); 2975 } 2976 } 2977 2978 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 2979 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { 2980 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 2981 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 2982 } 2983 } 2984 } 2985 2986 if (!Cmp) { 2987 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 2988 AArch64CC = changeIntCCToAArch64CC(CC); 2989 } 2990 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 2991 return Cmp; 2992 } 2993 2994 static std::pair<SDValue, SDValue> 2995 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 2996 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 2997 "Unsupported value type"); 2998 SDValue Value, Overflow; 2999 SDLoc DL(Op); 3000 SDValue LHS = Op.getOperand(0); 3001 SDValue RHS = Op.getOperand(1); 3002 unsigned Opc = 0; 3003 switch (Op.getOpcode()) { 3004 default: 3005 llvm_unreachable("Unknown overflow instruction!"); 3006 case ISD::SADDO: 3007 Opc = AArch64ISD::ADDS; 3008 CC = AArch64CC::VS; 3009 break; 3010 case ISD::UADDO: 3011 Opc = AArch64ISD::ADDS; 3012 CC = AArch64CC::HS; 3013 break; 3014 case ISD::SSUBO: 3015 Opc = AArch64ISD::SUBS; 3016 CC = AArch64CC::VS; 3017 break; 3018 case ISD::USUBO: 3019 Opc = AArch64ISD::SUBS; 3020 CC = AArch64CC::LO; 3021 break; 3022 // Multiply needs a little bit extra work. 3023 case ISD::SMULO: 3024 case ISD::UMULO: { 3025 CC = AArch64CC::NE; 3026 bool IsSigned = Op.getOpcode() == ISD::SMULO; 3027 if (Op.getValueType() == MVT::i32) { 3028 // Extend to 64-bits, then perform a 64-bit multiply. 3029 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3030 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 3031 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 3032 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3033 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); 3034 3035 // Check that the result fits into a 32-bit integer. 
3036 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); 3037 if (IsSigned) { 3038 // cmp xreg, wreg, sxtw 3039 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); 3040 Overflow = 3041 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); 3042 } else { 3043 // tst xreg, #0xffffffff00000000 3044 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); 3045 Overflow = 3046 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); 3047 } 3048 break; 3049 } 3050 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 3051 // For the 64 bit multiply 3052 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3053 if (IsSigned) { 3054 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 3055 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 3056 DAG.getConstant(63, DL, MVT::i64)); 3057 // It is important that LowerBits is last, otherwise the arithmetic 3058 // shift will not be folded into the compare (SUBS). 3059 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3060 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 3061 .getValue(1); 3062 } else { 3063 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 3064 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3065 Overflow = 3066 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 3067 DAG.getConstant(0, DL, MVT::i64), 3068 UpperBits).getValue(1); 3069 } 3070 break; 3071 } 3072 } // switch (...) 3073 3074 if (Opc) { 3075 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 3076 3077 // Emit the AArch64 operation with overflow check. 3078 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 3079 Overflow = Value.getValue(1); 3080 } 3081 return std::make_pair(Value, Overflow); 3082 } 3083 3084 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { 3085 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 3086 return LowerToScalableOp(Op, DAG); 3087 3088 SDValue Sel = Op.getOperand(0); 3089 SDValue Other = Op.getOperand(1); 3090 SDLoc dl(Sel); 3091 3092 // If the operand is an overflow checking operation, invert the condition 3093 // code and kill the Not operation. I.e., transform: 3094 // (xor (overflow_op_bool, 1)) 3095 // --> 3096 // (csel 1, 0, invert(cc), overflow_op_bool) 3097 // ... which later gets transformed to just a cset instruction with an 3098 // inverted condition code, rather than a cset + eor sequence. 3099 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { 3100 // Only lower legal XALUO ops. 3101 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) 3102 return SDValue(); 3103 3104 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3105 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3106 AArch64CC::CondCode CC; 3107 SDValue Value, Overflow; 3108 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); 3109 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3110 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, 3111 CCVal, Overflow); 3112 } 3113 // If neither operand is a SELECT_CC, give up. 3114 if (Sel.getOpcode() != ISD::SELECT_CC) 3115 std::swap(Sel, Other); 3116 if (Sel.getOpcode() != ISD::SELECT_CC) 3117 return Op; 3118 3119 // The folding we want to perform is: 3120 // (xor x, (select_cc a, b, cc, 0, -1) ) 3121 // --> 3122 // (csel x, (xor x, -1), cc ...) 3123 // 3124 // The latter will get matched to a CSINV instruction. 
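  //
  // For example, (xor w8, (select_cc a, b, cc, 0, -1)) would typically end up
  // as:
  //    cmp   <a>, <b>
  //    csinv w0, w8, w8, <cc>
  // i.e. w8 when the condition holds and ~w8 otherwise.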
3125 3126 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 3127 SDValue LHS = Sel.getOperand(0); 3128 SDValue RHS = Sel.getOperand(1); 3129 SDValue TVal = Sel.getOperand(2); 3130 SDValue FVal = Sel.getOperand(3); 3131 3132 // FIXME: This could be generalized to non-integer comparisons. 3133 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 3134 return Op; 3135 3136 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 3137 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 3138 3139 // The values aren't constants, this isn't the pattern we're looking for. 3140 if (!CFVal || !CTVal) 3141 return Op; 3142 3143 // We can commute the SELECT_CC by inverting the condition. This 3144 // might be needed to make this fit into a CSINV pattern. 3145 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 3146 std::swap(TVal, FVal); 3147 std::swap(CTVal, CFVal); 3148 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 3149 } 3150 3151 // If the constants line up, perform the transform! 3152 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 3153 SDValue CCVal; 3154 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3155 3156 FVal = Other; 3157 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 3158 DAG.getConstant(-1ULL, dl, Other.getValueType())); 3159 3160 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 3161 CCVal, Cmp); 3162 } 3163 3164 return Op; 3165 } 3166 3167 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 3168 EVT VT = Op.getValueType(); 3169 3170 // Let legalize expand this if it isn't a legal type yet. 3171 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 3172 return SDValue(); 3173 3174 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 3175 3176 unsigned Opc; 3177 bool ExtraOp = false; 3178 switch (Op.getOpcode()) { 3179 default: 3180 llvm_unreachable("Invalid code"); 3181 case ISD::ADDC: 3182 Opc = AArch64ISD::ADDS; 3183 break; 3184 case ISD::SUBC: 3185 Opc = AArch64ISD::SUBS; 3186 break; 3187 case ISD::ADDE: 3188 Opc = AArch64ISD::ADCS; 3189 ExtraOp = true; 3190 break; 3191 case ISD::SUBE: 3192 Opc = AArch64ISD::SBCS; 3193 ExtraOp = true; 3194 break; 3195 } 3196 3197 if (!ExtraOp) 3198 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 3199 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 3200 Op.getOperand(2)); 3201 } 3202 3203 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 3204 // Let legalize expand this if it isn't a legal type yet. 3205 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3206 return SDValue(); 3207 3208 SDLoc dl(Op); 3209 AArch64CC::CondCode CC; 3210 // The actual operation that sets the overflow or carry flag. 3211 SDValue Value, Overflow; 3212 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 3213 3214 // We use 0 and 1 as false and true values. 3215 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3216 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3217 3218 // We use an inverted condition, because the conditional select is inverted 3219 // too. This will allow it to be selected to a single instruction: 3220 // CSINC Wd, WZR, WZR, invert(cond). 
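  //
  // For example, an i32 uaddo typically becomes:
  //    adds w8, w0, w1
  //    cset w9, hs
  // where cset is the CSINC-with-WZR alias and HS tests the carry set by ADDS.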
3221 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3222 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 3223 CCVal, Overflow); 3224 3225 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3226 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3227 } 3228 3229 // Prefetch operands are: 3230 // 1: Address to prefetch 3231 // 2: bool isWrite 3232 // 3: int locality (0 = no locality ... 3 = extreme locality) 3233 // 4: bool isDataCache 3234 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 3235 SDLoc DL(Op); 3236 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 3237 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 3238 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3239 3240 bool IsStream = !Locality; 3241 // When the locality number is set 3242 if (Locality) { 3243 // The front-end should have filtered out the out-of-range values 3244 assert(Locality <= 3 && "Prefetch locality out-of-range"); 3245 // The locality degree is the opposite of the cache speed. 3246 // Put the number the other way around. 3247 // The encoding starts at 0 for level 1 3248 Locality = 3 - Locality; 3249 } 3250 3251 // built the mask value encoding the expected behavior. 3252 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 3253 (!IsData << 3) | // IsDataCache bit 3254 (Locality << 1) | // Cache level bits 3255 (unsigned)IsStream; // Stream bit 3256 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 3257 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 3258 } 3259 3260 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 3261 SelectionDAG &DAG) const { 3262 EVT VT = Op.getValueType(); 3263 if (VT.isScalableVector()) 3264 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); 3265 3266 if (useSVEForFixedLengthVectorVT(VT)) 3267 return LowerFixedLengthFPExtendToSVE(Op, DAG); 3268 3269 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 3270 return SDValue(); 3271 } 3272 3273 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 3274 SelectionDAG &DAG) const { 3275 if (Op.getValueType().isScalableVector()) 3276 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); 3277 3278 bool IsStrict = Op->isStrictFPOpcode(); 3279 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 3280 EVT SrcVT = SrcVal.getValueType(); 3281 3282 if (useSVEForFixedLengthVectorVT(SrcVT)) 3283 return LowerFixedLengthFPRoundToSVE(Op, DAG); 3284 3285 if (SrcVT != MVT::f128) { 3286 // Expand cases where the input is a vector bigger than NEON. 3287 if (useSVEForFixedLengthVectorVT(SrcVT)) 3288 return SDValue(); 3289 3290 // It's legal except when f128 is involved 3291 return Op; 3292 } 3293 3294 return SDValue(); 3295 } 3296 3297 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, 3298 SelectionDAG &DAG) const { 3299 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 3300 // Any additional optimization in this function should be recorded 3301 // in the cost tables. 3302 EVT InVT = Op.getOperand(0).getValueType(); 3303 EVT VT = Op.getValueType(); 3304 3305 if (VT.isScalableVector()) { 3306 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT 3307 ? 
AArch64ISD::FCVTZU_MERGE_PASSTHRU
3308                            : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3309     return LowerToPredicatedOp(Op, DAG, Opcode);
3310   }
3311 
3312   if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3313     return LowerFixedLengthFPToIntToSVE(Op, DAG);
3314 
3315   unsigned NumElts = InVT.getVectorNumElements();
3316 
3317   // f16 conversions are promoted to f32 when full fp16 is not supported.
3318   if (InVT.getVectorElementType() == MVT::f16 &&
3319       !Subtarget->hasFullFP16()) {
3320     MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3321     SDLoc dl(Op);
3322     return DAG.getNode(
3323         Op.getOpcode(), dl, Op.getValueType(),
3324         DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3325   }
3326 
3327   uint64_t VTSize = VT.getFixedSizeInBits();
3328   uint64_t InVTSize = InVT.getFixedSizeInBits();
3329   if (VTSize < InVTSize) {
3330     SDLoc dl(Op);
3331     SDValue Cv =
3332         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3333                     Op.getOperand(0));
3334     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3335   }
3336 
3337   if (VTSize > InVTSize) {
3338     SDLoc dl(Op);
3339     MVT ExtVT =
3340         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3341                          VT.getVectorNumElements());
3342     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3343     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3344   }
3345 
3346   // Type changing conversions are illegal.
3347   return Op;
3348 }
3349 
3350 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3351                                               SelectionDAG &DAG) const {
3352   bool IsStrict = Op->isStrictFPOpcode();
3353   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3354 
3355   if (SrcVal.getValueType().isVector())
3356     return LowerVectorFP_TO_INT(Op, DAG);
3357 
3358   // f16 conversions are promoted to f32 when full fp16 is not supported.
3359   if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3360     assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3361     SDLoc dl(Op);
3362     return DAG.getNode(
3363         Op.getOpcode(), dl, Op.getValueType(),
3364         DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3365   }
3366 
3367   if (SrcVal.getValueType() != MVT::f128) {
3368     // It's legal except when f128 is involved
3369     return Op;
3370   }
3371 
3372   return SDValue();
3373 }
3374 
3375 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3376                                                   SelectionDAG &DAG) const {
3377   // AArch64 FP-to-int conversions saturate to the destination register size, so
3378   // we can lower common saturating conversions to simple instructions.
3379   SDValue SrcVal = Op.getOperand(0);
3380 
3381   EVT SrcVT = SrcVal.getValueType();
3382   EVT DstVT = Op.getValueType();
3383 
3384   EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3385   uint64_t SatWidth = SatVT.getScalarSizeInBits();
3386   uint64_t DstWidth = DstVT.getScalarSizeInBits();
3387   assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3388 
3389   // TODO: Support lowering of NEON and SVE conversions.
3390   if (SrcVT.isVector())
3391     return SDValue();
3392 
3393   // TODO: Saturate to SatWidth explicitly.
3394   if (SatWidth != DstWidth)
3395     return SDValue();
3396 
3397   // In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
3398   if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
3399     return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
3400                        DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
3401                        Op.getOperand(1));
3402 
3403   // Cases that we can emit directly.
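  // For example, a saturating f64 -> i32 conversion needs nothing more than
  //    fcvtzs w0, d0
  // since fcvtzs/fcvtzu already clamp out-of-range inputs to the destination
  // range and map NaN to zero.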
3404 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || 3405 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && 3406 (DstVT == MVT::i64 || DstVT == MVT::i32)) 3407 return Op; 3408 3409 // For all other cases, fall back on the expanded form. 3410 return SDValue(); 3411 } 3412 3413 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, 3414 SelectionDAG &DAG) const { 3415 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 3416 // Any additional optimization in this function should be recorded 3417 // in the cost tables. 3418 EVT VT = Op.getValueType(); 3419 SDLoc dl(Op); 3420 SDValue In = Op.getOperand(0); 3421 EVT InVT = In.getValueType(); 3422 unsigned Opc = Op.getOpcode(); 3423 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; 3424 3425 if (VT.isScalableVector()) { 3426 if (InVT.getVectorElementType() == MVT::i1) { 3427 // We can't directly extend an SVE predicate; extend it first. 3428 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3429 EVT CastVT = getPromotedVTForPredicate(InVT); 3430 In = DAG.getNode(CastOpc, dl, CastVT, In); 3431 return DAG.getNode(Opc, dl, VT, In); 3432 } 3433 3434 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU 3435 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; 3436 return LowerToPredicatedOp(Op, DAG, Opcode); 3437 } 3438 3439 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT)) 3440 return LowerFixedLengthIntToFPToSVE(Op, DAG); 3441 3442 uint64_t VTSize = VT.getFixedSizeInBits(); 3443 uint64_t InVTSize = InVT.getFixedSizeInBits(); 3444 if (VTSize < InVTSize) { 3445 MVT CastVT = 3446 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 3447 InVT.getVectorNumElements()); 3448 In = DAG.getNode(Opc, dl, CastVT, In); 3449 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 3450 } 3451 3452 if (VTSize > InVTSize) { 3453 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3454 EVT CastVT = VT.changeVectorElementTypeToInteger(); 3455 In = DAG.getNode(CastOpc, dl, CastVT, In); 3456 return DAG.getNode(Opc, dl, VT, In); 3457 } 3458 3459 return Op; 3460 } 3461 3462 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 3463 SelectionDAG &DAG) const { 3464 if (Op.getValueType().isVector()) 3465 return LowerVectorINT_TO_FP(Op, DAG); 3466 3467 bool IsStrict = Op->isStrictFPOpcode(); 3468 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 3469 3470 // f16 conversions are promoted to f32 when full fp16 is not supported. 3471 if (Op.getValueType() == MVT::f16 && 3472 !Subtarget->hasFullFP16()) { 3473 assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); 3474 SDLoc dl(Op); 3475 return DAG.getNode( 3476 ISD::FP_ROUND, dl, MVT::f16, 3477 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), 3478 DAG.getIntPtrConstant(0, dl)); 3479 } 3480 3481 // i128 conversions are libcalls. 3482 if (SrcVal.getValueType() == MVT::i128) 3483 return SDValue(); 3484 3485 // Other conversions are legal, unless it's to the completely software-based 3486 // fp128. 3487 if (Op.getValueType() != MVT::f128) 3488 return Op; 3489 return SDValue(); 3490 } 3491 3492 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 3493 SelectionDAG &DAG) const { 3494 // For iOS, we want to call an alternative entry point: __sincos_stret, 3495 // which returns the values in two S / D registers. 
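  // The { sin, cos } result is expected back in s0/s1 for f32 and d0/d1 for
  // f64 under the AArch64 Darwin ABI, which is why a plain fast call with a
  // struct return type is sufficient below.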
3496 SDLoc dl(Op); 3497 SDValue Arg = Op.getOperand(0); 3498 EVT ArgVT = Arg.getValueType(); 3499 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 3500 3501 ArgListTy Args; 3502 ArgListEntry Entry; 3503 3504 Entry.Node = Arg; 3505 Entry.Ty = ArgTy; 3506 Entry.IsSExt = false; 3507 Entry.IsZExt = false; 3508 Args.push_back(Entry); 3509 3510 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 3511 : RTLIB::SINCOS_STRET_F32; 3512 const char *LibcallName = getLibcallName(LC); 3513 SDValue Callee = 3514 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 3515 3516 StructType *RetTy = StructType::get(ArgTy, ArgTy); 3517 TargetLowering::CallLoweringInfo CLI(DAG); 3518 CLI.setDebugLoc(dl) 3519 .setChain(DAG.getEntryNode()) 3520 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 3521 3522 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3523 return CallResult.first; 3524 } 3525 3526 static MVT getSVEContainerType(EVT ContentTy); 3527 3528 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, 3529 SelectionDAG &DAG) const { 3530 EVT OpVT = Op.getValueType(); 3531 EVT ArgVT = Op.getOperand(0).getValueType(); 3532 3533 if (useSVEForFixedLengthVectorVT(OpVT)) 3534 return LowerFixedLengthBitcastToSVE(Op, DAG); 3535 3536 if (OpVT.isScalableVector()) { 3537 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { 3538 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && 3539 "Expected int->fp bitcast!"); 3540 SDValue ExtResult = 3541 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), 3542 Op.getOperand(0)); 3543 return getSVESafeBitCast(OpVT, ExtResult, DAG); 3544 } 3545 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); 3546 } 3547 3548 if (OpVT != MVT::f16 && OpVT != MVT::bf16) 3549 return SDValue(); 3550 3551 assert(ArgVT == MVT::i16); 3552 SDLoc DL(Op); 3553 3554 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 3555 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 3556 return SDValue( 3557 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, 3558 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 3559 0); 3560 } 3561 3562 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 3563 if (OrigVT.getSizeInBits() >= 64) 3564 return OrigVT; 3565 3566 assert(OrigVT.isSimple() && "Expecting a simple value type"); 3567 3568 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 3569 switch (OrigSimpleTy) { 3570 default: llvm_unreachable("Unexpected Vector Type"); 3571 case MVT::v2i8: 3572 case MVT::v2i16: 3573 return MVT::v2i32; 3574 case MVT::v4i8: 3575 return MVT::v4i16; 3576 } 3577 } 3578 3579 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 3580 const EVT &OrigTy, 3581 const EVT &ExtTy, 3582 unsigned ExtOpcode) { 3583 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 3584 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 3585 // 64-bits we need to insert a new extension so that it will be 64-bits. 3586 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 3587 if (OrigTy.getSizeInBits() >= 64) 3588 return N; 3589 3590 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
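  // For example, a v4i8 operand is first widened to v4i16 here (see
  // getExtensionTo64Bits) so that the later SMULL/UMULL sees a 64-bit input
  // vector.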
3591 EVT NewVT = getExtensionTo64Bits(OrigTy); 3592 3593 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 3594 } 3595 3596 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 3597 bool isSigned) { 3598 EVT VT = N->getValueType(0); 3599 3600 if (N->getOpcode() != ISD::BUILD_VECTOR) 3601 return false; 3602 3603 for (const SDValue &Elt : N->op_values()) { 3604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 3605 unsigned EltSize = VT.getScalarSizeInBits(); 3606 unsigned HalfSize = EltSize / 2; 3607 if (isSigned) { 3608 if (!isIntN(HalfSize, C->getSExtValue())) 3609 return false; 3610 } else { 3611 if (!isUIntN(HalfSize, C->getZExtValue())) 3612 return false; 3613 } 3614 continue; 3615 } 3616 return false; 3617 } 3618 3619 return true; 3620 } 3621 3622 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 3623 if (N->getOpcode() == ISD::SIGN_EXTEND || 3624 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) 3625 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 3626 N->getOperand(0)->getValueType(0), 3627 N->getValueType(0), 3628 N->getOpcode()); 3629 3630 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 3631 EVT VT = N->getValueType(0); 3632 SDLoc dl(N); 3633 unsigned EltSize = VT.getScalarSizeInBits() / 2; 3634 unsigned NumElts = VT.getVectorNumElements(); 3635 MVT TruncVT = MVT::getIntegerVT(EltSize); 3636 SmallVector<SDValue, 8> Ops; 3637 for (unsigned i = 0; i != NumElts; ++i) { 3638 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 3639 const APInt &CInt = C->getAPIntValue(); 3640 // Element types smaller than 32 bits are not legal, so use i32 elements. 3641 // The values are implicitly truncated so sext vs. zext doesn't matter. 3642 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 3643 } 3644 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 3645 } 3646 3647 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 3648 return N->getOpcode() == ISD::SIGN_EXTEND || 3649 N->getOpcode() == ISD::ANY_EXTEND || 3650 isExtendedBUILD_VECTOR(N, DAG, true); 3651 } 3652 3653 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 3654 return N->getOpcode() == ISD::ZERO_EXTEND || 3655 N->getOpcode() == ISD::ANY_EXTEND || 3656 isExtendedBUILD_VECTOR(N, DAG, false); 3657 } 3658 3659 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 3660 unsigned Opcode = N->getOpcode(); 3661 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 3662 SDNode *N0 = N->getOperand(0).getNode(); 3663 SDNode *N1 = N->getOperand(1).getNode(); 3664 return N0->hasOneUse() && N1->hasOneUse() && 3665 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 3666 } 3667 return false; 3668 } 3669 3670 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 3671 unsigned Opcode = N->getOpcode(); 3672 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 3673 SDNode *N0 = N->getOperand(0).getNode(); 3674 SDNode *N1 = N->getOperand(1).getNode(); 3675 return N0->hasOneUse() && N1->hasOneUse() && 3676 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 3677 } 3678 return false; 3679 } 3680 3681 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 3682 SelectionDAG &DAG) const { 3683 // The rounding mode is in bits 23:22 of the FPSCR. 3684 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 3685 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 3686 // so that the shift + and get folded into a bitfield extract. 
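  //
  // For example, with FPCR.RMode == 0b01 (round towards +infinity) the value
  // computed below is ((FPCR + (1 << 22)) >> 22) & 3 == 2, which is the
  // FLT_ROUNDS encoding for upward rounding.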
3687 SDLoc dl(Op); 3688 3689 SDValue Chain = Op.getOperand(0); 3690 SDValue FPCR_64 = DAG.getNode( 3691 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, 3692 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); 3693 Chain = FPCR_64.getValue(1); 3694 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); 3695 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, 3696 DAG.getConstant(1U << 22, dl, MVT::i32)); 3697 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 3698 DAG.getConstant(22, dl, MVT::i32)); 3699 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 3700 DAG.getConstant(3, dl, MVT::i32)); 3701 return DAG.getMergeValues({AND, Chain}, dl); 3702 } 3703 3704 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, 3705 SelectionDAG &DAG) const { 3706 SDLoc DL(Op); 3707 SDValue Chain = Op->getOperand(0); 3708 SDValue RMValue = Op->getOperand(1); 3709 3710 // The rounding mode is in bits 23:22 of the FPCR. 3711 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping 3712 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is 3713 // ((arg - 1) & 3) << 22). 3714 // 3715 // The argument of llvm.set.rounding must be within the segment [0, 3], so 3716 // NearestTiesToAway (4) is not handled here. It is responsibility of the code 3717 // generated llvm.set.rounding to ensure this condition. 3718 3719 // Calculate new value of FPCR[23:22]. 3720 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, 3721 DAG.getConstant(1, DL, MVT::i32)); 3722 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, 3723 DAG.getConstant(0x3, DL, MVT::i32)); 3724 RMValue = 3725 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, 3726 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32)); 3727 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue); 3728 3729 // Get current value of FPCR. 3730 SDValue Ops[] = { 3731 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}; 3732 SDValue FPCR = 3733 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops); 3734 Chain = FPCR.getValue(1); 3735 FPCR = FPCR.getValue(0); 3736 3737 // Put new rounding mode into FPSCR[23:22]. 3738 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); 3739 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR, 3740 DAG.getConstant(RMMask, DL, MVT::i64)); 3741 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue); 3742 SDValue Ops2[] = { 3743 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), 3744 FPCR}; 3745 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 3746 } 3747 3748 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 3749 EVT VT = Op.getValueType(); 3750 3751 // If SVE is available then i64 vector multiplications can also be made legal. 3752 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; 3753 3754 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) 3755 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); 3756 3757 // Multiplications are only custom-lowered for 128-bit vectors so that 3758 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
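  // For example, (mul (sext v2i32), (sext v2i32)) yielding v2i64 is selected
  // as a single
  //    smull v0.2d, v0.2s, v1.2s
  // rather than being expanded.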
3759 assert(VT.is128BitVector() && VT.isInteger() && 3760 "unexpected type for custom-lowering ISD::MUL"); 3761 SDNode *N0 = Op.getOperand(0).getNode(); 3762 SDNode *N1 = Op.getOperand(1).getNode(); 3763 unsigned NewOpc = 0; 3764 bool isMLA = false; 3765 bool isN0SExt = isSignExtended(N0, DAG); 3766 bool isN1SExt = isSignExtended(N1, DAG); 3767 if (isN0SExt && isN1SExt) 3768 NewOpc = AArch64ISD::SMULL; 3769 else { 3770 bool isN0ZExt = isZeroExtended(N0, DAG); 3771 bool isN1ZExt = isZeroExtended(N1, DAG); 3772 if (isN0ZExt && isN1ZExt) 3773 NewOpc = AArch64ISD::UMULL; 3774 else if (isN1SExt || isN1ZExt) { 3775 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 3776 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 3777 if (isN1SExt && isAddSubSExt(N0, DAG)) { 3778 NewOpc = AArch64ISD::SMULL; 3779 isMLA = true; 3780 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 3781 NewOpc = AArch64ISD::UMULL; 3782 isMLA = true; 3783 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 3784 std::swap(N0, N1); 3785 NewOpc = AArch64ISD::UMULL; 3786 isMLA = true; 3787 } 3788 } 3789 3790 if (!NewOpc) { 3791 if (VT == MVT::v2i64) 3792 // Fall through to expand this. It is not legal. 3793 return SDValue(); 3794 else 3795 // Other vector multiplications are legal. 3796 return Op; 3797 } 3798 } 3799 3800 // Legalize to a S/UMULL instruction 3801 SDLoc DL(Op); 3802 SDValue Op0; 3803 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 3804 if (!isMLA) { 3805 Op0 = skipExtensionForVectorMULL(N0, DAG); 3806 assert(Op0.getValueType().is64BitVector() && 3807 Op1.getValueType().is64BitVector() && 3808 "unexpected types for extended operands to VMULL"); 3809 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 3810 } 3811 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 3812 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 3813 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 3814 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 3815 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 3816 EVT Op1VT = Op1.getValueType(); 3817 return DAG.getNode(N0->getOpcode(), DL, VT, 3818 DAG.getNode(NewOpc, DL, VT, 3819 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 3820 DAG.getNode(NewOpc, DL, VT, 3821 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 3822 } 3823 3824 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, 3825 int Pattern) { 3826 return DAG.getNode(AArch64ISD::PTRUE, DL, VT, 3827 DAG.getTargetConstant(Pattern, DL, MVT::i32)); 3828 } 3829 3830 static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { 3831 SDLoc DL(Op); 3832 EVT OutVT = Op.getValueType(); 3833 SDValue InOp = Op.getOperand(1); 3834 EVT InVT = InOp.getValueType(); 3835 3836 // Return the operand if the cast isn't changing type, 3837 // i.e. <n x 16 x i1> -> <n x 16 x i1> 3838 if (InVT == OutVT) 3839 return InOp; 3840 3841 SDValue Reinterpret = 3842 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp); 3843 3844 // If the argument converted to an svbool is a ptrue or a comparison, the 3845 // lanes introduced by the widening are zero by construction. 3846 switch (InOp.getOpcode()) { 3847 case AArch64ISD::SETCC_MERGE_ZERO: 3848 return Reinterpret; 3849 case ISD::INTRINSIC_WO_CHAIN: 3850 if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue) 3851 return Reinterpret; 3852 } 3853 3854 // Otherwise, zero the newly introduced lanes. 
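  // For example, when an nxv4i1 predicate is widened to nxv16i1, only the
  // lowest byte-lane of each original .s element carries the predicate bit;
  // the AND with (reinterpret (ptrue.s all)) built below clears the other
  // three lanes in each group.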
3855 SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); 3856 SDValue MaskReinterpret = 3857 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask); 3858 return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret); 3859 } 3860 3861 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 3862 SelectionDAG &DAG) const { 3863 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3864 SDLoc dl(Op); 3865 switch (IntNo) { 3866 default: return SDValue(); // Don't custom lower most intrinsics. 3867 case Intrinsic::thread_pointer: { 3868 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3869 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 3870 } 3871 case Intrinsic::aarch64_neon_abs: { 3872 EVT Ty = Op.getValueType(); 3873 if (Ty == MVT::i64) { 3874 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, 3875 Op.getOperand(1)); 3876 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); 3877 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); 3878 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { 3879 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); 3880 } else { 3881 report_fatal_error("Unexpected type for AArch64 NEON intrinic"); 3882 } 3883 } 3884 case Intrinsic::aarch64_neon_smax: 3885 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 3886 Op.getOperand(1), Op.getOperand(2)); 3887 case Intrinsic::aarch64_neon_umax: 3888 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 3889 Op.getOperand(1), Op.getOperand(2)); 3890 case Intrinsic::aarch64_neon_smin: 3891 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 3892 Op.getOperand(1), Op.getOperand(2)); 3893 case Intrinsic::aarch64_neon_umin: 3894 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 3895 Op.getOperand(1), Op.getOperand(2)); 3896 3897 case Intrinsic::aarch64_sve_sunpkhi: 3898 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), 3899 Op.getOperand(1)); 3900 case Intrinsic::aarch64_sve_sunpklo: 3901 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), 3902 Op.getOperand(1)); 3903 case Intrinsic::aarch64_sve_uunpkhi: 3904 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), 3905 Op.getOperand(1)); 3906 case Intrinsic::aarch64_sve_uunpklo: 3907 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), 3908 Op.getOperand(1)); 3909 case Intrinsic::aarch64_sve_clasta_n: 3910 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), 3911 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3912 case Intrinsic::aarch64_sve_clastb_n: 3913 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), 3914 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3915 case Intrinsic::aarch64_sve_lasta: 3916 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), 3917 Op.getOperand(1), Op.getOperand(2)); 3918 case Intrinsic::aarch64_sve_lastb: 3919 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), 3920 Op.getOperand(1), Op.getOperand(2)); 3921 case Intrinsic::aarch64_sve_rev: 3922 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(), 3923 Op.getOperand(1)); 3924 case Intrinsic::aarch64_sve_tbl: 3925 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), 3926 Op.getOperand(1), Op.getOperand(2)); 3927 case Intrinsic::aarch64_sve_trn1: 3928 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), 3929 Op.getOperand(1), Op.getOperand(2)); 3930 case Intrinsic::aarch64_sve_trn2: 3931 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), 3932 Op.getOperand(1), 
Op.getOperand(2)); 3933 case Intrinsic::aarch64_sve_uzp1: 3934 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), 3935 Op.getOperand(1), Op.getOperand(2)); 3936 case Intrinsic::aarch64_sve_uzp2: 3937 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), 3938 Op.getOperand(1), Op.getOperand(2)); 3939 case Intrinsic::aarch64_sve_zip1: 3940 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), 3941 Op.getOperand(1), Op.getOperand(2)); 3942 case Intrinsic::aarch64_sve_zip2: 3943 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), 3944 Op.getOperand(1), Op.getOperand(2)); 3945 case Intrinsic::aarch64_sve_splice: 3946 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), 3947 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3948 case Intrinsic::aarch64_sve_ptrue: 3949 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), 3950 Op.getOperand(1)); 3951 case Intrinsic::aarch64_sve_clz: 3952 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), 3953 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3954 case Intrinsic::aarch64_sve_cnt: { 3955 SDValue Data = Op.getOperand(3); 3956 // CTPOP only supports integer operands. 3957 if (Data.getValueType().isFloatingPoint()) 3958 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); 3959 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), 3960 Op.getOperand(2), Data, Op.getOperand(1)); 3961 } 3962 case Intrinsic::aarch64_sve_dupq_lane: 3963 return LowerDUPQLane(Op, DAG); 3964 case Intrinsic::aarch64_sve_convert_from_svbool: 3965 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), 3966 Op.getOperand(1)); 3967 case Intrinsic::aarch64_sve_convert_to_svbool: 3968 return lowerConvertToSVBool(Op, DAG); 3969 case Intrinsic::aarch64_sve_fneg: 3970 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), 3971 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3972 case Intrinsic::aarch64_sve_frintp: 3973 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), 3974 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3975 case Intrinsic::aarch64_sve_frintm: 3976 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), 3977 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3978 case Intrinsic::aarch64_sve_frinti: 3979 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), 3980 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3981 case Intrinsic::aarch64_sve_frintx: 3982 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), 3983 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3984 case Intrinsic::aarch64_sve_frinta: 3985 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), 3986 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3987 case Intrinsic::aarch64_sve_frintn: 3988 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), 3989 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3990 case Intrinsic::aarch64_sve_frintz: 3991 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), 3992 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 3993 case Intrinsic::aarch64_sve_ucvtf: 3994 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, 3995 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 3996 Op.getOperand(1)); 3997 case Intrinsic::aarch64_sve_scvtf: 3998 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, 3999 
Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 4000 Op.getOperand(1)); 4001 case Intrinsic::aarch64_sve_fcvtzu: 4002 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, 4003 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 4004 Op.getOperand(1)); 4005 case Intrinsic::aarch64_sve_fcvtzs: 4006 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, 4007 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 4008 Op.getOperand(1)); 4009 case Intrinsic::aarch64_sve_fsqrt: 4010 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), 4011 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4012 case Intrinsic::aarch64_sve_frecpx: 4013 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), 4014 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4015 case Intrinsic::aarch64_sve_fabs: 4016 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), 4017 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4018 case Intrinsic::aarch64_sve_abs: 4019 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), 4020 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4021 case Intrinsic::aarch64_sve_neg: 4022 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), 4023 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4024 case Intrinsic::aarch64_sve_insr: { 4025 SDValue Scalar = Op.getOperand(2); 4026 EVT ScalarTy = Scalar.getValueType(); 4027 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 4028 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 4029 4030 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), 4031 Op.getOperand(1), Scalar); 4032 } 4033 case Intrinsic::aarch64_sve_rbit: 4034 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, 4035 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 4036 Op.getOperand(1)); 4037 case Intrinsic::aarch64_sve_revb: 4038 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), 4039 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 4040 case Intrinsic::aarch64_sve_sxtb: 4041 return DAG.getNode( 4042 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4043 Op.getOperand(2), Op.getOperand(3), 4044 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), 4045 Op.getOperand(1)); 4046 case Intrinsic::aarch64_sve_sxth: 4047 return DAG.getNode( 4048 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4049 Op.getOperand(2), Op.getOperand(3), 4050 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), 4051 Op.getOperand(1)); 4052 case Intrinsic::aarch64_sve_sxtw: 4053 return DAG.getNode( 4054 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4055 Op.getOperand(2), Op.getOperand(3), 4056 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), 4057 Op.getOperand(1)); 4058 case Intrinsic::aarch64_sve_uxtb: 4059 return DAG.getNode( 4060 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4061 Op.getOperand(2), Op.getOperand(3), 4062 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), 4063 Op.getOperand(1)); 4064 case Intrinsic::aarch64_sve_uxth: 4065 return DAG.getNode( 4066 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4067 Op.getOperand(2), Op.getOperand(3), 4068 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), 4069 Op.getOperand(1)); 4070 case Intrinsic::aarch64_sve_uxtw: 4071 return DAG.getNode( 4072 
AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 4073 Op.getOperand(2), Op.getOperand(3), 4074 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), 4075 Op.getOperand(1)); 4076 4077 case Intrinsic::localaddress: { 4078 const auto &MF = DAG.getMachineFunction(); 4079 const auto *RegInfo = Subtarget->getRegisterInfo(); 4080 unsigned Reg = RegInfo->getLocalAddressRegister(MF); 4081 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, 4082 Op.getSimpleValueType()); 4083 } 4084 4085 case Intrinsic::eh_recoverfp: { 4086 // FIXME: This needs to be implemented to correctly handle highly aligned 4087 // stack objects. For now we simply return the incoming FP. Refer D53541 4088 // for more details. 4089 SDValue FnOp = Op.getOperand(1); 4090 SDValue IncomingFPOp = Op.getOperand(2); 4091 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 4092 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); 4093 if (!Fn) 4094 report_fatal_error( 4095 "llvm.eh.recoverfp must take a function as the first argument"); 4096 return IncomingFPOp; 4097 } 4098 4099 case Intrinsic::aarch64_neon_vsri: 4100 case Intrinsic::aarch64_neon_vsli: { 4101 EVT Ty = Op.getValueType(); 4102 4103 if (!Ty.isVector()) 4104 report_fatal_error("Unexpected type for aarch64_neon_vsli"); 4105 4106 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); 4107 4108 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; 4109 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; 4110 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), 4111 Op.getOperand(3)); 4112 } 4113 4114 case Intrinsic::aarch64_neon_srhadd: 4115 case Intrinsic::aarch64_neon_urhadd: 4116 case Intrinsic::aarch64_neon_shadd: 4117 case Intrinsic::aarch64_neon_uhadd: { 4118 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || 4119 IntNo == Intrinsic::aarch64_neon_shadd); 4120 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || 4121 IntNo == Intrinsic::aarch64_neon_urhadd); 4122 unsigned Opcode = 4123 IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) 4124 : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); 4125 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 4126 Op.getOperand(2)); 4127 } 4128 case Intrinsic::aarch64_neon_sabd: 4129 case Intrinsic::aarch64_neon_uabd: { 4130 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU 4131 : ISD::ABDS; 4132 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 4133 Op.getOperand(2)); 4134 } 4135 case Intrinsic::aarch64_neon_uaddlp: { 4136 unsigned Opcode = AArch64ISD::UADDLP; 4137 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); 4138 } 4139 case Intrinsic::aarch64_neon_sdot: 4140 case Intrinsic::aarch64_neon_udot: 4141 case Intrinsic::aarch64_sve_sdot: 4142 case Intrinsic::aarch64_sve_udot: { 4143 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || 4144 IntNo == Intrinsic::aarch64_sve_udot) 4145 ? 
AArch64ISD::UDOT 4146 : AArch64ISD::SDOT; 4147 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 4148 Op.getOperand(2), Op.getOperand(3)); 4149 } 4150 } 4151 } 4152 4153 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { 4154 if (VT.getVectorElementType() == MVT::i8 || 4155 VT.getVectorElementType() == MVT::i16) { 4156 EltTy = MVT::i32; 4157 return true; 4158 } 4159 return false; 4160 } 4161 4162 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { 4163 if (VT.getVectorElementType() == MVT::i32 && 4164 VT.getVectorElementCount().getKnownMinValue() >= 4 && 4165 !VT.isFixedLengthVector()) 4166 return true; 4167 4168 return false; 4169 } 4170 4171 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 4172 return ExtVal.getValueType().isScalableVector(); 4173 } 4174 4175 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { 4176 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { 4177 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), 4178 AArch64ISD::GLD1_MERGE_ZERO}, 4179 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), 4180 AArch64ISD::GLD1_UXTW_MERGE_ZERO}, 4181 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), 4182 AArch64ISD::GLD1_MERGE_ZERO}, 4183 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), 4184 AArch64ISD::GLD1_SXTW_MERGE_ZERO}, 4185 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), 4186 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 4187 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), 4188 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, 4189 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), 4190 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 4191 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), 4192 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, 4193 }; 4194 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); 4195 return AddrModes.find(Key)->second; 4196 } 4197 4198 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { 4199 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { 4200 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), 4201 AArch64ISD::SST1_PRED}, 4202 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), 4203 AArch64ISD::SST1_UXTW_PRED}, 4204 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), 4205 AArch64ISD::SST1_PRED}, 4206 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), 4207 AArch64ISD::SST1_SXTW_PRED}, 4208 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), 4209 AArch64ISD::SST1_SCALED_PRED}, 4210 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), 4211 AArch64ISD::SST1_UXTW_SCALED_PRED}, 4212 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), 4213 AArch64ISD::SST1_SCALED_PRED}, 4214 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), 4215 AArch64ISD::SST1_SXTW_SCALED_PRED}, 4216 }; 4217 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); 4218 return AddrModes.find(Key)->second; 4219 } 4220 4221 unsigned getSignExtendedGatherOpcode(unsigned Opcode) { 4222 switch (Opcode) { 4223 default: 4224 llvm_unreachable("unimplemented opcode"); 4225 return Opcode; 4226 case AArch64ISD::GLD1_MERGE_ZERO: 4227 return AArch64ISD::GLD1S_MERGE_ZERO; 4228 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 4229 return 
AArch64ISD::GLD1S_IMM_MERGE_ZERO; 4230 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 4231 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 4232 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 4233 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 4234 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 4235 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 4236 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 4237 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 4238 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 4239 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 4240 } 4241 } 4242 4243 bool getGatherScatterIndexIsExtended(SDValue Index) { 4244 unsigned Opcode = Index.getOpcode(); 4245 if (Opcode == ISD::SIGN_EXTEND_INREG) 4246 return true; 4247 4248 if (Opcode == ISD::AND) { 4249 SDValue Splat = Index.getOperand(1); 4250 if (Splat.getOpcode() != ISD::SPLAT_VECTOR) 4251 return false; 4252 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); 4253 if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) 4254 return false; 4255 return true; 4256 } 4257 4258 return false; 4259 } 4260 4261 // If the base pointer of a masked gather or scatter is null, we 4262 // may be able to swap BasePtr & Index and use the vector + register 4263 // or vector + immediate addressing mode, e.g. 4264 // VECTOR + REGISTER: 4265 // getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices) 4266 // -> getelementptr %offset, <vscale x N x T> %indices 4267 // VECTOR + IMMEDIATE: 4268 // getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices) 4269 // -> getelementptr #x, <vscale x N x T> %indices 4270 void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, 4271 unsigned &Opcode, bool IsGather, 4272 SelectionDAG &DAG) { 4273 if (!isNullConstant(BasePtr)) 4274 return; 4275 4276 // FIXME: This will not match for fixed vector type codegen as the nodes in 4277 // question will have fixed<->scalable conversions around them. This should be 4278 // moved to a DAG combine or complex pattern so that is executes after all of 4279 // the fixed vector insert and extracts have been removed. This deficiency 4280 // will result in a sub-optimal addressing mode being used, i.e. an ADD not 4281 // being folded into the scatter/gather. 4282 ConstantSDNode *Offset = nullptr; 4283 if (Index.getOpcode() == ISD::ADD) 4284 if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { 4285 if (isa<ConstantSDNode>(SplatVal)) 4286 Offset = cast<ConstantSDNode>(SplatVal); 4287 else { 4288 BasePtr = SplatVal; 4289 Index = Index->getOperand(0); 4290 return; 4291 } 4292 } 4293 4294 unsigned NewOp = 4295 IsGather ? 
AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; 4296 4297 if (!Offset) { 4298 std::swap(BasePtr, Index); 4299 Opcode = NewOp; 4300 return; 4301 } 4302 4303 uint64_t OffsetVal = Offset->getZExtValue(); 4304 unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; 4305 auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); 4306 4307 if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { 4308 // Index is out of range for the immediate addressing mode 4309 BasePtr = ConstOffset; 4310 Index = Index->getOperand(0); 4311 return; 4312 } 4313 4314 // Immediate is in range 4315 Opcode = NewOp; 4316 BasePtr = Index->getOperand(0); 4317 Index = ConstOffset; 4318 } 4319 4320 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, 4321 SelectionDAG &DAG) const { 4322 SDLoc DL(Op); 4323 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); 4324 assert(MGT && "Can only custom lower gather load nodes"); 4325 4326 bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector(); 4327 4328 SDValue Index = MGT->getIndex(); 4329 SDValue Chain = MGT->getChain(); 4330 SDValue PassThru = MGT->getPassThru(); 4331 SDValue Mask = MGT->getMask(); 4332 SDValue BasePtr = MGT->getBasePtr(); 4333 ISD::LoadExtType ExtTy = MGT->getExtensionType(); 4334 4335 ISD::MemIndexType IndexType = MGT->getIndexType(); 4336 bool IsScaled = 4337 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; 4338 bool IsSigned = 4339 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; 4340 bool IdxNeedsExtend = 4341 getGatherScatterIndexIsExtended(Index) || 4342 Index.getSimpleValueType().getVectorElementType() == MVT::i32; 4343 bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; 4344 4345 EVT VT = PassThru.getSimpleValueType(); 4346 EVT IndexVT = Index.getSimpleValueType(); 4347 EVT MemVT = MGT->getMemoryVT(); 4348 SDValue InputVT = DAG.getValueType(MemVT); 4349 4350 if (VT.getVectorElementType() == MVT::bf16 && 4351 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 4352 return SDValue(); 4353 4354 if (IsFixedLength) { 4355 assert(Subtarget->useSVEForFixedLengthVectors() && 4356 "Cannot lower when not using SVE for fixed vectors"); 4357 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { 4358 IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); 4359 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); 4360 } else { 4361 MemVT = getContainerForFixedLengthVector(DAG, MemVT); 4362 IndexVT = MemVT.changeTypeToInteger(); 4363 } 4364 InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); 4365 Mask = DAG.getNode( 4366 ISD::ZERO_EXTEND, DL, 4367 VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); 4368 } 4369 4370 if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) 4371 PassThru = SDValue(); 4372 4373 if (VT.isFloatingPoint() && !IsFixedLength) { 4374 // Handle FP data by using an integer gather and casting the result. 
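// The SVE gather nodes used below operate on integer data, so any real
// pass-through value is bitcast to the equivalent packed integer vector here
// and the gathered result is bitcast back to the FP type once the load has
// been built.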
4375 if (PassThru) { 4376 EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount()); 4377 PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG); 4378 } 4379 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); 4380 } 4381 4382 SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other); 4383 4384 if (getGatherScatterIndexIsExtended(Index)) 4385 Index = Index.getOperand(0); 4386 4387 unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); 4388 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, 4389 /*isGather=*/true, DAG); 4390 4391 if (ResNeedsSignExtend) 4392 Opcode = getSignExtendedGatherOpcode(Opcode); 4393 4394 if (IsFixedLength) { 4395 if (Index.getSimpleValueType().isFixedLengthVector()) 4396 Index = convertToScalableVector(DAG, IndexVT, Index); 4397 if (BasePtr.getSimpleValueType().isFixedLengthVector()) 4398 BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); 4399 Mask = convertFixedMaskToScalableVector(Mask, DAG); 4400 } 4401 4402 SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT}; 4403 SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops); 4404 Chain = Result.getValue(1); 4405 4406 if (IsFixedLength) { 4407 Result = convertFromScalableVector( 4408 DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()), 4409 Result); 4410 Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result); 4411 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); 4412 4413 if (PassThru) 4414 Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru); 4415 } else { 4416 if (PassThru) 4417 Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru); 4418 4419 if (VT.isFloatingPoint()) 4420 Result = getSVESafeBitCast(VT, Result, DAG); 4421 } 4422 4423 return DAG.getMergeValues({Result, Chain}, DL); 4424 } 4425 4426 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, 4427 SelectionDAG &DAG) const { 4428 SDLoc DL(Op); 4429 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); 4430 assert(MSC && "Can only custom lower scatter store nodes"); 4431 4432 bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector(); 4433 4434 SDValue Index = MSC->getIndex(); 4435 SDValue Chain = MSC->getChain(); 4436 SDValue StoreVal = MSC->getValue(); 4437 SDValue Mask = MSC->getMask(); 4438 SDValue BasePtr = MSC->getBasePtr(); 4439 4440 ISD::MemIndexType IndexType = MSC->getIndexType(); 4441 bool IsScaled = 4442 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; 4443 bool IsSigned = 4444 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; 4445 bool NeedsExtend = 4446 getGatherScatterIndexIsExtended(Index) || 4447 Index.getSimpleValueType().getVectorElementType() == MVT::i32; 4448 4449 EVT VT = StoreVal.getSimpleValueType(); 4450 EVT IndexVT = Index.getSimpleValueType(); 4451 SDVTList VTs = DAG.getVTList(MVT::Other); 4452 EVT MemVT = MSC->getMemoryVT(); 4453 SDValue InputVT = DAG.getValueType(MemVT); 4454 4455 if (VT.getVectorElementType() == MVT::bf16 && 4456 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 4457 return SDValue(); 4458 4459 if (IsFixedLength) { 4460 assert(Subtarget->useSVEForFixedLengthVectors() && 4461 "Cannot lower when not using SVE for fixed vectors"); 4462 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { 4463 IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); 4464 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); 4465 } else { 4466 MemVT = getContainerForFixedLengthVector(DAG, MemVT); 4467 IndexVT = 
MemVT.changeTypeToInteger();
4468     }
4469     InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4470
4471     StoreVal =
4472         DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
4473     StoreVal = DAG.getNode(
4474         ISD::ANY_EXTEND, DL,
4475         VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
4476     StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
4477     Mask = DAG.getNode(
4478         ISD::ZERO_EXTEND, DL,
4479         VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4480   } else if (VT.isFloatingPoint()) {
4481     // Handle FP data by casting the data so an integer scatter can be used.
4482     EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4483     StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4484     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4485   }
4486
4487   if (getGatherScatterIndexIsExtended(Index))
4488     Index = Index.getOperand(0);
4489
4490   unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4491   selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4492                               /*isGather=*/false, DAG);
4493
4494   if (IsFixedLength) {
4495     if (Index.getSimpleValueType().isFixedLengthVector())
4496       Index = convertToScalableVector(DAG, IndexVT, Index);
4497     if (BasePtr.getSimpleValueType().isFixedLengthVector())
4498       BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4499     Mask = convertFixedMaskToScalableVector(Mask, DAG);
4500   }
4501
4502   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4503   return DAG.getNode(Opcode, DL, VTs, Ops);
4504 }
4505
4506 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
4507   SDLoc DL(Op);
4508   MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
4509   assert(LoadNode && "Expected custom lowering of a masked load node");
4510   EVT VT = Op->getValueType(0);
4511
4512   if (useSVEForFixedLengthVectorVT(VT, true))
4513     return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
4514
4515   SDValue PassThru = LoadNode->getPassThru();
4516   SDValue Mask = LoadNode->getMask();
4517
4518   if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4519     return Op;
4520
4521   SDValue Load = DAG.getMaskedLoad(
4522       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
4523       LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
4524       LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
4525       LoadNode->getExtensionType());
4526
4527   SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
4528
4529   return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
4530 }
4531
4532 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
4533 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
4534                                         EVT VT, EVT MemVT,
4535                                         SelectionDAG &DAG) {
4536   assert(VT.isVector() && "VT should be a vector type");
4537   assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4538
4539   SDValue Value = ST->getValue();
4540
4541   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
4542   // extracts the word lane which represents the v4i8 subvector.
// It optimizes the store
4543   // to:
4544   //
4545   //   xtn  v0.8b, v0.8h
4546   //   str  s0, [x0]
4547
4548   SDValue Undef = DAG.getUNDEF(MVT::i16);
4549   SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
4550                                         {Undef, Undef, Undef, Undef});
4551
4552   SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
4553                                  Value, UndefVec);
4554   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
4555
4556   Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4557   SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
4558                                      Trunc, DAG.getConstant(0, DL, MVT::i64));
4559
4560   return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4561                       ST->getBasePtr(), ST->getMemOperand());
4562 }
4563
4564 // Custom lowering for any store, vector or scalar, normal or truncating.
4565 // Currently this custom lowers v4i16->v4i8 truncating stores, 256-bit
4566 // non-temporal stores, volatile i128 stores and i64x8 stores.
4567 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4568                                           SelectionDAG &DAG) const {
4569   SDLoc Dl(Op);
4570   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
4571   assert(StoreNode && "Can only custom lower store nodes");
4572
4573   SDValue Value = StoreNode->getValue();
4574
4575   EVT VT = Value.getValueType();
4576   EVT MemVT = StoreNode->getMemoryVT();
4577
4578   if (VT.isVector()) {
4579     if (useSVEForFixedLengthVectorVT(VT, true))
4580       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4581
4582     unsigned AS = StoreNode->getAddressSpace();
4583     Align Alignment = StoreNode->getAlign();
4584     if (Alignment < MemVT.getStoreSize() &&
4585         !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
4586                                         StoreNode->getMemOperand()->getFlags(),
4587                                         nullptr)) {
4588       return scalarizeVectorStore(StoreNode, DAG);
4589     }
4590
4591     if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
4592         MemVT == MVT::v4i8) {
4593       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4594     }
4595     // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4596     // the custom lowering, as there are no un-paired non-temporal stores and
4597     // legalization will break up 256 bit inputs.
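// For example, a non-temporal store of a v4i64 value is split into its two
// v2i64 halves, which the STNP node is then expected to emit roughly as:
//
//   stnp q0, q1, [x0]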
4598 ElementCount EC = MemVT.getVectorElementCount(); 4599 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && 4600 EC.isKnownEven() && 4601 ((MemVT.getScalarSizeInBits() == 8u || 4602 MemVT.getScalarSizeInBits() == 16u || 4603 MemVT.getScalarSizeInBits() == 32u || 4604 MemVT.getScalarSizeInBits() == 64u))) { 4605 SDValue Lo = 4606 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 4607 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 4608 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); 4609 SDValue Hi = 4610 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 4611 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 4612 StoreNode->getValue(), 4613 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); 4614 SDValue Result = DAG.getMemIntrinsicNode( 4615 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), 4616 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 4617 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 4618 return Result; 4619 } 4620 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { 4621 assert(StoreNode->getValue()->getValueType(0) == MVT::i128); 4622 SDValue Lo = 4623 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), 4624 DAG.getConstant(0, Dl, MVT::i64)); 4625 SDValue Hi = 4626 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), 4627 DAG.getConstant(1, Dl, MVT::i64)); 4628 SDValue Result = DAG.getMemIntrinsicNode( 4629 AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other), 4630 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 4631 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 4632 return Result; 4633 } else if (MemVT == MVT::i64x8) { 4634 SDValue Value = StoreNode->getValue(); 4635 assert(Value->getValueType(0) == MVT::i64x8); 4636 SDValue Chain = StoreNode->getChain(); 4637 SDValue Base = StoreNode->getBasePtr(); 4638 EVT PtrVT = Base.getValueType(); 4639 for (unsigned i = 0; i < 8; i++) { 4640 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, 4641 Value, DAG.getConstant(i, Dl, MVT::i32)); 4642 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, 4643 DAG.getConstant(i * 8, Dl, PtrVT)); 4644 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), 4645 StoreNode->getOriginalAlign()); 4646 } 4647 return Chain; 4648 } 4649 4650 return SDValue(); 4651 } 4652 4653 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, 4654 SelectionDAG &DAG) const { 4655 SDLoc DL(Op); 4656 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 4657 assert(LoadNode && "Expected custom lowering of a load node"); 4658 4659 if (LoadNode->getMemoryVT() == MVT::i64x8) { 4660 SmallVector<SDValue, 8> Ops; 4661 SDValue Base = LoadNode->getBasePtr(); 4662 SDValue Chain = LoadNode->getChain(); 4663 EVT PtrVT = Base.getValueType(); 4664 for (unsigned i = 0; i < 8; i++) { 4665 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, 4666 DAG.getConstant(i * 8, DL, PtrVT)); 4667 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, 4668 LoadNode->getPointerInfo(), 4669 LoadNode->getOriginalAlign()); 4670 Ops.push_back(Part); 4671 Chain = SDValue(Part.getNode(), 1); 4672 } 4673 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); 4674 return DAG.getMergeValues({Loaded, Chain}, DL); 4675 } 4676 4677 // Custom lowering for extending v4i8 vector loads. 
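// The v4i8 memory value is loaded as a single 32-bit FP scalar, inserted into
// a vector, bitcast to v8i8 and widened, keeping only the low four lanes.
// For a zero-extending load this is expected to become roughly:
//
//   ldr   s0, [x0]
//   ushll v0.8h, v0.8b, #0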
4678 EVT VT = Op->getValueType(0); 4679 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); 4680 4681 if (LoadNode->getMemoryVT() != MVT::v4i8) 4682 return SDValue(); 4683 4684 unsigned ExtType; 4685 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) 4686 ExtType = ISD::SIGN_EXTEND; 4687 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || 4688 LoadNode->getExtensionType() == ISD::EXTLOAD) 4689 ExtType = ISD::ZERO_EXTEND; 4690 else 4691 return SDValue(); 4692 4693 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), 4694 LoadNode->getBasePtr(), MachinePointerInfo()); 4695 SDValue Chain = Load.getValue(1); 4696 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); 4697 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); 4698 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); 4699 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, 4700 DAG.getConstant(0, DL, MVT::i64)); 4701 if (VT == MVT::v4i32) 4702 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); 4703 return DAG.getMergeValues({Ext, Chain}, DL); 4704 } 4705 4706 // Generate SUBS and CSEL for integer abs. 4707 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { 4708 MVT VT = Op.getSimpleValueType(); 4709 4710 if (VT.isVector()) 4711 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); 4712 4713 SDLoc DL(Op); 4714 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 4715 Op.getOperand(0)); 4716 // Generate SUBS & CSEL. 4717 SDValue Cmp = 4718 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 4719 Op.getOperand(0), DAG.getConstant(0, DL, VT)); 4720 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, 4721 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 4722 Cmp.getValue(1)); 4723 } 4724 4725 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 4726 SelectionDAG &DAG) const { 4727 LLVM_DEBUG(dbgs() << "Custom lowering: "); 4728 LLVM_DEBUG(Op.dump()); 4729 4730 switch (Op.getOpcode()) { 4731 default: 4732 llvm_unreachable("unimplemented operand"); 4733 return SDValue(); 4734 case ISD::BITCAST: 4735 return LowerBITCAST(Op, DAG); 4736 case ISD::GlobalAddress: 4737 return LowerGlobalAddress(Op, DAG); 4738 case ISD::GlobalTLSAddress: 4739 return LowerGlobalTLSAddress(Op, DAG); 4740 case ISD::SETCC: 4741 case ISD::STRICT_FSETCC: 4742 case ISD::STRICT_FSETCCS: 4743 return LowerSETCC(Op, DAG); 4744 case ISD::BR_CC: 4745 return LowerBR_CC(Op, DAG); 4746 case ISD::SELECT: 4747 return LowerSELECT(Op, DAG); 4748 case ISD::SELECT_CC: 4749 return LowerSELECT_CC(Op, DAG); 4750 case ISD::JumpTable: 4751 return LowerJumpTable(Op, DAG); 4752 case ISD::BR_JT: 4753 return LowerBR_JT(Op, DAG); 4754 case ISD::ConstantPool: 4755 return LowerConstantPool(Op, DAG); 4756 case ISD::BlockAddress: 4757 return LowerBlockAddress(Op, DAG); 4758 case ISD::VASTART: 4759 return LowerVASTART(Op, DAG); 4760 case ISD::VACOPY: 4761 return LowerVACOPY(Op, DAG); 4762 case ISD::VAARG: 4763 return LowerVAARG(Op, DAG); 4764 case ISD::ADDC: 4765 case ISD::ADDE: 4766 case ISD::SUBC: 4767 case ISD::SUBE: 4768 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 4769 case ISD::SADDO: 4770 case ISD::UADDO: 4771 case ISD::SSUBO: 4772 case ISD::USUBO: 4773 case ISD::SMULO: 4774 case ISD::UMULO: 4775 return LowerXALUO(Op, DAG); 4776 case ISD::FADD: 4777 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); 4778 case ISD::FSUB: 4779 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); 4780 case ISD::FMUL: 4781 return 
LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); 4782 case ISD::FMA: 4783 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); 4784 case ISD::FDIV: 4785 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); 4786 case ISD::FNEG: 4787 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); 4788 case ISD::FCEIL: 4789 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); 4790 case ISD::FFLOOR: 4791 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); 4792 case ISD::FNEARBYINT: 4793 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); 4794 case ISD::FRINT: 4795 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); 4796 case ISD::FROUND: 4797 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); 4798 case ISD::FROUNDEVEN: 4799 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); 4800 case ISD::FTRUNC: 4801 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); 4802 case ISD::FSQRT: 4803 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); 4804 case ISD::FABS: 4805 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); 4806 case ISD::FP_ROUND: 4807 case ISD::STRICT_FP_ROUND: 4808 return LowerFP_ROUND(Op, DAG); 4809 case ISD::FP_EXTEND: 4810 return LowerFP_EXTEND(Op, DAG); 4811 case ISD::FRAMEADDR: 4812 return LowerFRAMEADDR(Op, DAG); 4813 case ISD::SPONENTRY: 4814 return LowerSPONENTRY(Op, DAG); 4815 case ISD::RETURNADDR: 4816 return LowerRETURNADDR(Op, DAG); 4817 case ISD::ADDROFRETURNADDR: 4818 return LowerADDROFRETURNADDR(Op, DAG); 4819 case ISD::CONCAT_VECTORS: 4820 return LowerCONCAT_VECTORS(Op, DAG); 4821 case ISD::INSERT_VECTOR_ELT: 4822 return LowerINSERT_VECTOR_ELT(Op, DAG); 4823 case ISD::EXTRACT_VECTOR_ELT: 4824 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 4825 case ISD::BUILD_VECTOR: 4826 return LowerBUILD_VECTOR(Op, DAG); 4827 case ISD::VECTOR_SHUFFLE: 4828 return LowerVECTOR_SHUFFLE(Op, DAG); 4829 case ISD::SPLAT_VECTOR: 4830 return LowerSPLAT_VECTOR(Op, DAG); 4831 case ISD::EXTRACT_SUBVECTOR: 4832 return LowerEXTRACT_SUBVECTOR(Op, DAG); 4833 case ISD::INSERT_SUBVECTOR: 4834 return LowerINSERT_SUBVECTOR(Op, DAG); 4835 case ISD::SDIV: 4836 case ISD::UDIV: 4837 return LowerDIV(Op, DAG); 4838 case ISD::SMIN: 4839 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, 4840 /*OverrideNEON=*/true); 4841 case ISD::UMIN: 4842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, 4843 /*OverrideNEON=*/true); 4844 case ISD::SMAX: 4845 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, 4846 /*OverrideNEON=*/true); 4847 case ISD::UMAX: 4848 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, 4849 /*OverrideNEON=*/true); 4850 case ISD::SRA: 4851 case ISD::SRL: 4852 case ISD::SHL: 4853 return LowerVectorSRA_SRL_SHL(Op, DAG); 4854 case ISD::SHL_PARTS: 4855 case ISD::SRL_PARTS: 4856 case ISD::SRA_PARTS: 4857 return LowerShiftParts(Op, DAG); 4858 case ISD::CTPOP: 4859 return LowerCTPOP(Op, DAG); 4860 case ISD::FCOPYSIGN: 4861 return LowerFCOPYSIGN(Op, DAG); 4862 case ISD::OR: 4863 return LowerVectorOR(Op, DAG); 4864 case ISD::XOR: 4865 return LowerXOR(Op, DAG); 4866 case ISD::PREFETCH: 4867 return LowerPREFETCH(Op, DAG); 4868 case ISD::SINT_TO_FP: 4869 case ISD::UINT_TO_FP: 4870 case ISD::STRICT_SINT_TO_FP: 4871 case ISD::STRICT_UINT_TO_FP: 4872 return LowerINT_TO_FP(Op, DAG); 4873 case ISD::FP_TO_SINT: 4874 case ISD::FP_TO_UINT: 4875 case ISD::STRICT_FP_TO_SINT: 4876 case 
ISD::STRICT_FP_TO_UINT: 4877 return LowerFP_TO_INT(Op, DAG); 4878 case ISD::FP_TO_SINT_SAT: 4879 case ISD::FP_TO_UINT_SAT: 4880 return LowerFP_TO_INT_SAT(Op, DAG); 4881 case ISD::FSINCOS: 4882 return LowerFSINCOS(Op, DAG); 4883 case ISD::FLT_ROUNDS_: 4884 return LowerFLT_ROUNDS_(Op, DAG); 4885 case ISD::SET_ROUNDING: 4886 return LowerSET_ROUNDING(Op, DAG); 4887 case ISD::MUL: 4888 return LowerMUL(Op, DAG); 4889 case ISD::MULHS: 4890 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED, 4891 /*OverrideNEON=*/true); 4892 case ISD::MULHU: 4893 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, 4894 /*OverrideNEON=*/true); 4895 case ISD::INTRINSIC_WO_CHAIN: 4896 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 4897 case ISD::STORE: 4898 return LowerSTORE(Op, DAG); 4899 case ISD::MSTORE: 4900 return LowerFixedLengthVectorMStoreToSVE(Op, DAG); 4901 case ISD::MGATHER: 4902 return LowerMGATHER(Op, DAG); 4903 case ISD::MSCATTER: 4904 return LowerMSCATTER(Op, DAG); 4905 case ISD::VECREDUCE_SEQ_FADD: 4906 return LowerVECREDUCE_SEQ_FADD(Op, DAG); 4907 case ISD::VECREDUCE_ADD: 4908 case ISD::VECREDUCE_AND: 4909 case ISD::VECREDUCE_OR: 4910 case ISD::VECREDUCE_XOR: 4911 case ISD::VECREDUCE_SMAX: 4912 case ISD::VECREDUCE_SMIN: 4913 case ISD::VECREDUCE_UMAX: 4914 case ISD::VECREDUCE_UMIN: 4915 case ISD::VECREDUCE_FADD: 4916 case ISD::VECREDUCE_FMAX: 4917 case ISD::VECREDUCE_FMIN: 4918 return LowerVECREDUCE(Op, DAG); 4919 case ISD::ATOMIC_LOAD_SUB: 4920 return LowerATOMIC_LOAD_SUB(Op, DAG); 4921 case ISD::ATOMIC_LOAD_AND: 4922 return LowerATOMIC_LOAD_AND(Op, DAG); 4923 case ISD::DYNAMIC_STACKALLOC: 4924 return LowerDYNAMIC_STACKALLOC(Op, DAG); 4925 case ISD::VSCALE: 4926 return LowerVSCALE(Op, DAG); 4927 case ISD::ANY_EXTEND: 4928 case ISD::SIGN_EXTEND: 4929 case ISD::ZERO_EXTEND: 4930 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); 4931 case ISD::SIGN_EXTEND_INREG: { 4932 // Only custom lower when ExtraVT has a legal byte based element type. 
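// Nodes with a legal byte based element type are lowered to the predicated
// SIGN_EXTEND_INREG_MERGE_PASSTHRU form below; anything else returns SDValue()
// so the normal legalization takes over.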
4933 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 4934 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 4935 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && 4936 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) 4937 return SDValue(); 4938 4939 return LowerToPredicatedOp(Op, DAG, 4940 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); 4941 } 4942 case ISD::TRUNCATE: 4943 return LowerTRUNCATE(Op, DAG); 4944 case ISD::MLOAD: 4945 return LowerMLOAD(Op, DAG); 4946 case ISD::LOAD: 4947 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 4948 return LowerFixedLengthVectorLoadToSVE(Op, DAG); 4949 return LowerLOAD(Op, DAG); 4950 case ISD::ADD: 4951 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); 4952 case ISD::AND: 4953 return LowerToScalableOp(Op, DAG); 4954 case ISD::SUB: 4955 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); 4956 case ISD::FMAXIMUM: 4957 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); 4958 case ISD::FMAXNUM: 4959 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); 4960 case ISD::FMINIMUM: 4961 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED); 4962 case ISD::FMINNUM: 4963 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); 4964 case ISD::VSELECT: 4965 return LowerFixedLengthVectorSelectToSVE(Op, DAG); 4966 case ISD::ABS: 4967 return LowerABS(Op, DAG); 4968 case ISD::BITREVERSE: 4969 return LowerBitreverse(Op, DAG); 4970 case ISD::BSWAP: 4971 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); 4972 case ISD::CTLZ: 4973 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, 4974 /*OverrideNEON=*/true); 4975 case ISD::CTTZ: 4976 return LowerCTTZ(Op, DAG); 4977 case ISD::VECTOR_SPLICE: 4978 return LowerVECTOR_SPLICE(Op, DAG); 4979 } 4980 } 4981 4982 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { 4983 return !Subtarget->useSVEForFixedLengthVectors(); 4984 } 4985 4986 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( 4987 EVT VT, bool OverrideNEON) const { 4988 if (!Subtarget->useSVEForFixedLengthVectors()) 4989 return false; 4990 4991 if (!VT.isFixedLengthVector()) 4992 return false; 4993 4994 // Don't use SVE for vectors we cannot scalarize if required. 4995 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 4996 // Fixed length predicates should be promoted to i8. 4997 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. 4998 case MVT::i1: 4999 default: 5000 return false; 5001 case MVT::i8: 5002 case MVT::i16: 5003 case MVT::i32: 5004 case MVT::i64: 5005 case MVT::f16: 5006 case MVT::f32: 5007 case MVT::f64: 5008 break; 5009 } 5010 5011 // All SVE implementations support NEON sized vectors. 5012 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) 5013 return true; 5014 5015 // Ensure NEON MVTs only belong to a single register class. 5016 if (VT.getFixedSizeInBits() <= 128) 5017 return false; 5018 5019 // Don't use SVE for types that don't fit. 5020 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) 5021 return false; 5022 5023 // TODO: Perhaps an artificial restriction, but worth having whilst getting 5024 // the base fixed length SVE support in place. 
5025 if (!VT.isPow2VectorType()) 5026 return false; 5027 5028 return true; 5029 } 5030 5031 //===----------------------------------------------------------------------===// 5032 // Calling Convention Implementation 5033 //===----------------------------------------------------------------------===// 5034 5035 /// Selects the correct CCAssignFn for a given CallingConvention value. 5036 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 5037 bool IsVarArg) const { 5038 switch (CC) { 5039 default: 5040 report_fatal_error("Unsupported calling convention."); 5041 case CallingConv::WebKit_JS: 5042 return CC_AArch64_WebKit_JS; 5043 case CallingConv::GHC: 5044 return CC_AArch64_GHC; 5045 case CallingConv::C: 5046 case CallingConv::Fast: 5047 case CallingConv::PreserveMost: 5048 case CallingConv::CXX_FAST_TLS: 5049 case CallingConv::Swift: 5050 case CallingConv::SwiftTail: 5051 case CallingConv::Tail: 5052 if (Subtarget->isTargetWindows() && IsVarArg) 5053 return CC_AArch64_Win64_VarArg; 5054 if (!Subtarget->isTargetDarwin()) 5055 return CC_AArch64_AAPCS; 5056 if (!IsVarArg) 5057 return CC_AArch64_DarwinPCS; 5058 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg 5059 : CC_AArch64_DarwinPCS_VarArg; 5060 case CallingConv::Win64: 5061 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; 5062 case CallingConv::CFGuard_Check: 5063 return CC_AArch64_Win64_CFGuard_Check; 5064 case CallingConv::AArch64_VectorCall: 5065 case CallingConv::AArch64_SVE_VectorCall: 5066 return CC_AArch64_AAPCS; 5067 } 5068 } 5069 5070 CCAssignFn * 5071 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { 5072 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS 5073 : RetCC_AArch64_AAPCS; 5074 } 5075 5076 SDValue AArch64TargetLowering::LowerFormalArguments( 5077 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 5078 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 5079 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 5080 MachineFunction &MF = DAG.getMachineFunction(); 5081 MachineFrameInfo &MFI = MF.getFrameInfo(); 5082 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 5083 5084 // Assign locations to all of the incoming arguments. 5085 SmallVector<CCValAssign, 16> ArgLocs; 5086 DenseMap<unsigned, SDValue> CopiedRegs; 5087 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 5088 *DAG.getContext()); 5089 5090 // At this point, Ins[].VT may already be promoted to i32. To correctly 5091 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 5092 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 5093 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 5094 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 5095 // LocVT. 5096 unsigned NumArgs = Ins.size(); 5097 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 5098 unsigned CurArgIdx = 0; 5099 for (unsigned i = 0; i != NumArgs; ++i) { 5100 MVT ValVT = Ins[i].VT; 5101 if (Ins[i].isOrigArg()) { 5102 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 5103 CurArgIdx = Ins[i].getOrigArgIndex(); 5104 5105 // Get type of the original argument. 5106 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 5107 /*AllowUnknown*/ true); 5108 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 5109 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
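// e.g. an i1 or i8 argument is analysed with an i8 ValVT/LocVT so that the
// assignment function can give it a byte-sized stack slot rather than
// promoting it to a full i32 word.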
5110 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 5111 ValVT = MVT::i8; 5112 else if (ActualMVT == MVT::i16) 5113 ValVT = MVT::i16; 5114 } 5115 bool UseVarArgCC = false; 5116 if (IsWin64) 5117 UseVarArgCC = isVarArg; 5118 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); 5119 bool Res = 5120 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 5121 assert(!Res && "Call operand has unhandled type"); 5122 (void)Res; 5123 } 5124 SmallVector<SDValue, 16> ArgValues; 5125 unsigned ExtraArgLocs = 0; 5126 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 5127 CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; 5128 5129 if (Ins[i].Flags.isByVal()) { 5130 // Byval is used for HFAs in the PCS, but the system should work in a 5131 // non-compliant manner for larger structs. 5132 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5133 int Size = Ins[i].Flags.getByValSize(); 5134 unsigned NumRegs = (Size + 7) / 8; 5135 5136 // FIXME: This works on big-endian for composite byvals, which are the common 5137 // case. It should also work for fundamental types too. 5138 unsigned FrameIdx = 5139 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 5140 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 5141 InVals.push_back(FrameIdxN); 5142 5143 continue; 5144 } 5145 5146 if (Ins[i].Flags.isSwiftAsync()) 5147 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 5148 5149 SDValue ArgValue; 5150 if (VA.isRegLoc()) { 5151 // Arguments stored in registers. 5152 EVT RegVT = VA.getLocVT(); 5153 const TargetRegisterClass *RC; 5154 5155 if (RegVT == MVT::i32) 5156 RC = &AArch64::GPR32RegClass; 5157 else if (RegVT == MVT::i64) 5158 RC = &AArch64::GPR64RegClass; 5159 else if (RegVT == MVT::f16 || RegVT == MVT::bf16) 5160 RC = &AArch64::FPR16RegClass; 5161 else if (RegVT == MVT::f32) 5162 RC = &AArch64::FPR32RegClass; 5163 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 5164 RC = &AArch64::FPR64RegClass; 5165 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 5166 RC = &AArch64::FPR128RegClass; 5167 else if (RegVT.isScalableVector() && 5168 RegVT.getVectorElementType() == MVT::i1) 5169 RC = &AArch64::PPRRegClass; 5170 else if (RegVT.isScalableVector()) 5171 RC = &AArch64::ZPRRegClass; 5172 else 5173 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 5174 5175 // Transform the arguments in physical registers into virtual ones. 5176 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 5177 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 5178 5179 // If this is an 8, 16 or 32-bit value, it is really passed promoted 5180 // to 64 bits. Insert an assert[sz]ext to capture this, then 5181 // truncate to the right size. 
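// How the value is adjusted depends on VA.getLocInfo(): BCvt bitcasts back to
// the value type, while AExtUpper shifts the value down from the upper 32 bits
// before zero-extending or truncating; plain extensions need no extra work
// here.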
5182 switch (VA.getLocInfo()) { 5183 default: 5184 llvm_unreachable("Unknown loc info!"); 5185 case CCValAssign::Full: 5186 break; 5187 case CCValAssign::Indirect: 5188 assert(VA.getValVT().isScalableVector() && 5189 "Only scalable vectors can be passed indirectly"); 5190 break; 5191 case CCValAssign::BCvt: 5192 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 5193 break; 5194 case CCValAssign::AExt: 5195 case CCValAssign::SExt: 5196 case CCValAssign::ZExt: 5197 break; 5198 case CCValAssign::AExtUpper: 5199 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, 5200 DAG.getConstant(32, DL, RegVT)); 5201 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); 5202 break; 5203 } 5204 } else { // VA.isRegLoc() 5205 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 5206 unsigned ArgOffset = VA.getLocMemOffset(); 5207 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect 5208 ? VA.getLocVT().getSizeInBits() 5209 : VA.getValVT().getSizeInBits()) / 8; 5210 5211 uint32_t BEAlign = 0; 5212 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 5213 !Ins[i].Flags.isInConsecutiveRegs()) 5214 BEAlign = 8 - ArgSize; 5215 5216 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 5217 5218 // Create load nodes to retrieve arguments from the stack. 5219 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 5220 5221 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 5222 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 5223 MVT MemVT = VA.getValVT(); 5224 5225 switch (VA.getLocInfo()) { 5226 default: 5227 break; 5228 case CCValAssign::Trunc: 5229 case CCValAssign::BCvt: 5230 MemVT = VA.getLocVT(); 5231 break; 5232 case CCValAssign::Indirect: 5233 assert(VA.getValVT().isScalableVector() && 5234 "Only scalable vectors can be passed indirectly"); 5235 MemVT = VA.getLocVT(); 5236 break; 5237 case CCValAssign::SExt: 5238 ExtType = ISD::SEXTLOAD; 5239 break; 5240 case CCValAssign::ZExt: 5241 ExtType = ISD::ZEXTLOAD; 5242 break; 5243 case CCValAssign::AExt: 5244 ExtType = ISD::EXTLOAD; 5245 break; 5246 } 5247 5248 ArgValue = DAG.getExtLoad( 5249 ExtType, DL, VA.getLocVT(), Chain, FIN, 5250 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 5251 MemVT); 5252 } 5253 5254 if (VA.getLocInfo() == CCValAssign::Indirect) { 5255 assert(VA.getValVT().isScalableVector() && 5256 "Only scalable vectors can be passed indirectly"); 5257 5258 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize(); 5259 unsigned NumParts = 1; 5260 if (Ins[i].Flags.isInConsecutiveRegs()) { 5261 assert(!Ins[i].Flags.isInConsecutiveRegsLast()); 5262 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) 5263 ++NumParts; 5264 } 5265 5266 MVT PartLoad = VA.getValVT(); 5267 SDValue Ptr = ArgValue; 5268 5269 // Ensure we generate all loads for each tuple part, whilst updating the 5270 // pointer after each load correctly using vscale. 
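// Each tuple part is loaded in turn and the pointer is advanced between parts
// by a VL-scaled byte increment (PartSize * vscale).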
5271 while (NumParts > 0) { 5272 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); 5273 InVals.push_back(ArgValue); 5274 NumParts--; 5275 if (NumParts > 0) { 5276 SDValue BytesIncrement = DAG.getVScale( 5277 DL, Ptr.getValueType(), 5278 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); 5279 SDNodeFlags Flags; 5280 Flags.setNoUnsignedWrap(true); 5281 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 5282 BytesIncrement, Flags); 5283 ExtraArgLocs++; 5284 i++; 5285 } 5286 } 5287 } else { 5288 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) 5289 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), 5290 ArgValue, DAG.getValueType(MVT::i32)); 5291 InVals.push_back(ArgValue); 5292 } 5293 } 5294 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); 5295 5296 // varargs 5297 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 5298 if (isVarArg) { 5299 if (!Subtarget->isTargetDarwin() || IsWin64) { 5300 // The AAPCS variadic function ABI is identical to the non-variadic 5301 // one. As a result there may be more arguments in registers and we should 5302 // save them for future reference. 5303 // Win64 variadic functions also pass arguments in registers, but all float 5304 // arguments are passed in integer registers. 5305 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 5306 } 5307 5308 // This will point to the next argument passed via stack. 5309 unsigned StackOffset = CCInfo.getNextStackOffset(); 5310 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 5311 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8); 5312 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); 5313 5314 if (MFI.hasMustTailInVarArgFunc()) { 5315 SmallVector<MVT, 2> RegParmTypes; 5316 RegParmTypes.push_back(MVT::i64); 5317 RegParmTypes.push_back(MVT::f128); 5318 // Compute the set of forwarded registers. The rest are scratch. 5319 SmallVectorImpl<ForwardedRegister> &Forwards = 5320 FuncInfo->getForwardedMustTailRegParms(); 5321 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, 5322 CC_AArch64_AAPCS); 5323 5324 // Conservatively forward X8, since it might be used for aggregate return. 5325 if (!CCInfo.isAllocated(AArch64::X8)) { 5326 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); 5327 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); 5328 } 5329 } 5330 } 5331 5332 // On Windows, InReg pointers must be returned, so record the pointer in a 5333 // virtual register at the start of the function so it can be returned in the 5334 // epilogue. 
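// Only the first InReg argument is recorded; its incoming value is copied into
// a fresh virtual register which is saved via setSRetReturnReg for the return
// lowering to use.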
5335 if (IsWin64) { 5336 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 5337 if (Ins[I].Flags.isInReg()) { 5338 assert(!FuncInfo->getSRetReturnReg()); 5339 5340 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 5341 Register Reg = 5342 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 5343 FuncInfo->setSRetReturnReg(Reg); 5344 5345 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); 5346 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); 5347 break; 5348 } 5349 } 5350 } 5351 5352 unsigned StackArgSize = CCInfo.getNextStackOffset(); 5353 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 5354 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 5355 // This is a non-standard ABI so by fiat I say we're allowed to make full 5356 // use of the stack area to be popped, which must be aligned to 16 bytes in 5357 // any case: 5358 StackArgSize = alignTo(StackArgSize, 16); 5359 5360 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 5361 // a multiple of 16. 5362 FuncInfo->setArgumentStackToRestore(StackArgSize); 5363 5364 // This realignment carries over to the available bytes below. Our own 5365 // callers will guarantee the space is free by giving an aligned value to 5366 // CALLSEQ_START. 5367 } 5368 // Even if we're not expected to free up the space, it's useful to know how 5369 // much is there while considering tail calls (because we can reuse it). 5370 FuncInfo->setBytesInStackArgArea(StackArgSize); 5371 5372 if (Subtarget->hasCustomCallingConv()) 5373 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); 5374 5375 return Chain; 5376 } 5377 5378 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 5379 SelectionDAG &DAG, 5380 const SDLoc &DL, 5381 SDValue &Chain) const { 5382 MachineFunction &MF = DAG.getMachineFunction(); 5383 MachineFrameInfo &MFI = MF.getFrameInfo(); 5384 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 5385 auto PtrVT = getPointerTy(DAG.getDataLayout()); 5386 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 5387 5388 SmallVector<SDValue, 8> MemOps; 5389 5390 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 5391 AArch64::X3, AArch64::X4, AArch64::X5, 5392 AArch64::X6, AArch64::X7 }; 5393 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 5394 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 5395 5396 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 5397 int GPRIdx = 0; 5398 if (GPRSaveSize != 0) { 5399 if (IsWin64) { 5400 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); 5401 if (GPRSaveSize & 15) 5402 // The extra size here, if triggered, will always be 8. 5403 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); 5404 } else 5405 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); 5406 5407 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 5408 5409 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 5410 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 5411 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 5412 SDValue Store = DAG.getStore( 5413 Val.getValue(1), DL, Val, FIN, 5414 IsWin64 5415 ? 
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), 5416 GPRIdx, 5417 (i - FirstVariadicGPR) * 8) 5418 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); 5419 MemOps.push_back(Store); 5420 FIN = 5421 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 5422 } 5423 } 5424 FuncInfo->setVarArgsGPRIndex(GPRIdx); 5425 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 5426 5427 if (Subtarget->hasFPARMv8() && !IsWin64) { 5428 static const MCPhysReg FPRArgRegs[] = { 5429 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 5430 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 5431 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 5432 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 5433 5434 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 5435 int FPRIdx = 0; 5436 if (FPRSaveSize != 0) { 5437 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); 5438 5439 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 5440 5441 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 5442 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 5443 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 5444 5445 SDValue Store = DAG.getStore( 5446 Val.getValue(1), DL, Val, FIN, 5447 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); 5448 MemOps.push_back(Store); 5449 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 5450 DAG.getConstant(16, DL, PtrVT)); 5451 } 5452 } 5453 FuncInfo->setVarArgsFPRIndex(FPRIdx); 5454 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 5455 } 5456 5457 if (!MemOps.empty()) { 5458 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 5459 } 5460 } 5461 5462 /// LowerCallResult - Lower the result values of a call into the 5463 /// appropriate copies out of appropriate physical registers. 5464 SDValue AArch64TargetLowering::LowerCallResult( 5465 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 5466 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 5467 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 5468 SDValue ThisVal) const { 5469 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 5470 // Assign locations to each value returned by this call. 5471 SmallVector<CCValAssign, 16> RVLocs; 5472 DenseMap<unsigned, SDValue> CopiedRegs; 5473 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 5474 *DAG.getContext()); 5475 CCInfo.AnalyzeCallResult(Ins, RetCC); 5476 5477 // Copy all of the result registers out of their specified physreg. 5478 for (unsigned i = 0; i != RVLocs.size(); ++i) { 5479 CCValAssign VA = RVLocs[i]; 5480 5481 // Pass 'this' value directly from the argument to return value, to avoid 5482 // reg unit interference 5483 if (i == 0 && isThisReturn) { 5484 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 5485 "unexpected return calling convention register assignment"); 5486 InVals.push_back(ThisVal); 5487 continue; 5488 } 5489 5490 // Avoid copying a physreg twice since RegAllocFast is incompetent and only 5491 // allows one use of a physreg per block. 
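// CopiedRegs caches the first CopyFromReg created for each physical register
// so that later results assigned to the same register reuse it.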
5492 SDValue Val = CopiedRegs.lookup(VA.getLocReg()); 5493 if (!Val) { 5494 Val = 5495 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 5496 Chain = Val.getValue(1); 5497 InFlag = Val.getValue(2); 5498 CopiedRegs[VA.getLocReg()] = Val; 5499 } 5500 5501 switch (VA.getLocInfo()) { 5502 default: 5503 llvm_unreachable("Unknown loc info!"); 5504 case CCValAssign::Full: 5505 break; 5506 case CCValAssign::BCvt: 5507 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 5508 break; 5509 case CCValAssign::AExtUpper: 5510 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, 5511 DAG.getConstant(32, DL, VA.getLocVT())); 5512 LLVM_FALLTHROUGH; 5513 case CCValAssign::AExt: 5514 LLVM_FALLTHROUGH; 5515 case CCValAssign::ZExt: 5516 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); 5517 break; 5518 } 5519 5520 InVals.push_back(Val); 5521 } 5522 5523 return Chain; 5524 } 5525 5526 /// Return true if the calling convention is one that we can guarantee TCO for. 5527 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { 5528 return (CC == CallingConv::Fast && GuaranteeTailCalls) || 5529 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 5530 } 5531 5532 /// Return true if we might ever do TCO for calls with this calling convention. 5533 static bool mayTailCallThisCC(CallingConv::ID CC) { 5534 switch (CC) { 5535 case CallingConv::C: 5536 case CallingConv::AArch64_SVE_VectorCall: 5537 case CallingConv::PreserveMost: 5538 case CallingConv::Swift: 5539 case CallingConv::SwiftTail: 5540 case CallingConv::Tail: 5541 case CallingConv::Fast: 5542 return true; 5543 default: 5544 return false; 5545 } 5546 } 5547 5548 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 5549 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 5550 const SmallVectorImpl<ISD::OutputArg> &Outs, 5551 const SmallVectorImpl<SDValue> &OutVals, 5552 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 5553 if (!mayTailCallThisCC(CalleeCC)) 5554 return false; 5555 5556 MachineFunction &MF = DAG.getMachineFunction(); 5557 const Function &CallerF = MF.getFunction(); 5558 CallingConv::ID CallerCC = CallerF.getCallingConv(); 5559 5560 // Functions using the C or Fast calling convention that have an SVE signature 5561 // preserve more registers and should assume the SVE_VectorCall CC. 5562 // The check for matching callee-saved regs will determine whether it is 5563 // eligible for TCO. 5564 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && 5565 AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) 5566 CallerCC = CallingConv::AArch64_SVE_VectorCall; 5567 5568 bool CCMatch = CallerCC == CalleeCC; 5569 5570 // When using the Windows calling convention on a non-windows OS, we want 5571 // to back up and restore X18 in such functions; we can't do a tail call 5572 // from those functions. 5573 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && 5574 CalleeCC != CallingConv::Win64) 5575 return false; 5576 5577 // Byval parameters hand the function a pointer directly into the stack area 5578 // we want to reuse during a tail call. Working around this *is* possible (see 5579 // X86) but less efficient and uglier in LowerCall. 5580 for (Function::const_arg_iterator i = CallerF.arg_begin(), 5581 e = CallerF.arg_end(); 5582 i != e; ++i) { 5583 if (i->hasByValAttr()) 5584 return false; 5585 5586 // On Windows, "inreg" attributes signify non-aggregate indirect returns. 5587 // In this case, it is necessary to save/restore X0 in the callee. 
Tail 5588 // call opt interferes with this. So we disable tail call opt when the 5589 // caller has an argument with "inreg" attribute. 5590 5591 // FIXME: Check whether the callee also has an "inreg" argument. 5592 if (i->hasInRegAttr()) 5593 return false; 5594 } 5595 5596 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) 5597 return CCMatch; 5598 5599 // Externally-defined functions with weak linkage should not be 5600 // tail-called on AArch64 when the OS does not support dynamic 5601 // pre-emption of symbols, as the AAELF spec requires normal calls 5602 // to undefined weak functions to be replaced with a NOP or jump to the 5603 // next instruction. The behaviour of branch instructions in this 5604 // situation (as used for tail calls) is implementation-defined, so we 5605 // cannot rely on the linker replacing the tail call with a return. 5606 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 5607 const GlobalValue *GV = G->getGlobal(); 5608 const Triple &TT = getTargetMachine().getTargetTriple(); 5609 if (GV->hasExternalWeakLinkage() && 5610 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 5611 return false; 5612 } 5613 5614 // Now we search for cases where we can use a tail call without changing the 5615 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 5616 // concept. 5617 5618 // I want anyone implementing a new calling convention to think long and hard 5619 // about this assert. 5620 assert((!isVarArg || CalleeCC == CallingConv::C) && 5621 "Unexpected variadic calling convention"); 5622 5623 LLVMContext &C = *DAG.getContext(); 5624 if (isVarArg && !Outs.empty()) { 5625 // At least two cases here: if caller is fastcc then we can't have any 5626 // memory arguments (we'd be expected to clean up the stack afterwards). If 5627 // caller is C then we could potentially use its argument area. 5628 5629 // FIXME: for now we take the most conservative of these in both cases: 5630 // disallow all variadic memory operands. 5631 SmallVector<CCValAssign, 16> ArgLocs; 5632 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 5633 5634 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 5635 for (const CCValAssign &ArgLoc : ArgLocs) 5636 if (!ArgLoc.isRegLoc()) 5637 return false; 5638 } 5639 5640 // Check that the call results are passed in the same way. 5641 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 5642 CCAssignFnForCall(CalleeCC, isVarArg), 5643 CCAssignFnForCall(CallerCC, isVarArg))) 5644 return false; 5645 // The callee has to preserve all registers the caller needs to preserve. 
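// In other words the callee's preserved-register mask must cover the caller's;
// if a custom calling convention is in use, both masks are updated before the
// subset check below.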
5646   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5647   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5648   if (!CCMatch) {
5649     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5650     if (Subtarget->hasCustomCallingConv()) {
5651       TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5652       TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5653     }
5654     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5655       return false;
5656   }
5657
5658   // Nothing more to check if the callee is taking no arguments.
5659   if (Outs.empty())
5660     return true;
5661
5662   SmallVector<CCValAssign, 16> ArgLocs;
5663   CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5664
5665   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5666
5667   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5668
5669   // If any of the arguments is passed indirectly, it must be SVE, so the
5670   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
5671   // allocate space on the stack. That is why we check this explicitly here:
5672   // if any argument is passed indirectly, the call cannot be a tail call.
5673   if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5674         assert((A.getLocInfo() != CCValAssign::Indirect ||
5675                 A.getValVT().isScalableVector()) &&
5676                "Expected value to be scalable");
5677         return A.getLocInfo() == CCValAssign::Indirect;
5678       }))
5679     return false;
5680
5681   // If the stack arguments for this call do not fit into our own save area
5682   // then the call cannot be made tail.
5683   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5684     return false;
5685
5686   const MachineRegisterInfo &MRI = MF.getRegInfo();
5687   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5688     return false;
5689
5690   return true;
5691 }
5692
5693 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5694                                                    SelectionDAG &DAG,
5695                                                    MachineFrameInfo &MFI,
5696                                                    int ClobberedFI) const {
5697   SmallVector<SDValue, 8> ArgChains;
5698   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5699   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5700
5701   // Include the original chain at the beginning of the list. When this is
5702   // used by target LowerCall hooks, this helps legalize find the
5703   // CALLSEQ_BEGIN node.
5704   ArgChains.push_back(Chain);
5705
5706   // Add a chain value for each stack-argument load that overlaps ClobberedFI.
5707   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
5708                             UE = DAG.getEntryNode().getNode()->use_end();
5709        U != UE; ++U)
5710     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5711       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5712         if (FI->getIndex() < 0) {
5713           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5714           int64_t InLastByte = InFirstByte;
5715           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5716
5717           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5718               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5719             ArgChains.push_back(SDValue(L, 1));
5720         }
5721
5722   // Build a tokenfactor for all the chains.
5723 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 5724 } 5725 5726 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 5727 bool TailCallOpt) const { 5728 return (CallCC == CallingConv::Fast && TailCallOpt) || 5729 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; 5730 } 5731 5732 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 5733 /// and add input and output parameter nodes. 5734 SDValue 5735 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 5736 SmallVectorImpl<SDValue> &InVals) const { 5737 SelectionDAG &DAG = CLI.DAG; 5738 SDLoc &DL = CLI.DL; 5739 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 5740 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 5741 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 5742 SDValue Chain = CLI.Chain; 5743 SDValue Callee = CLI.Callee; 5744 bool &IsTailCall = CLI.IsTailCall; 5745 CallingConv::ID CallConv = CLI.CallConv; 5746 bool IsVarArg = CLI.IsVarArg; 5747 5748 MachineFunction &MF = DAG.getMachineFunction(); 5749 MachineFunction::CallSiteInfo CSInfo; 5750 bool IsThisReturn = false; 5751 5752 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 5753 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 5754 bool IsSibCall = false; 5755 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv); 5756 5757 // Check callee args/returns for SVE registers and set calling convention 5758 // accordingly. 5759 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { 5760 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ 5761 return Out.VT.isScalableVector(); 5762 }); 5763 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ 5764 return In.VT.isScalableVector(); 5765 }); 5766 5767 if (CalleeInSVE || CalleeOutSVE) 5768 CallConv = CallingConv::AArch64_SVE_VectorCall; 5769 } 5770 5771 if (IsTailCall) { 5772 // Check if it's really possible to do a tail call. 5773 IsTailCall = isEligibleForTailCallOptimization( 5774 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 5775 5776 // A sibling call is one where we're under the usual C ABI and not planning 5777 // to change that but can still do a tail call: 5778 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && 5779 CallConv != CallingConv::SwiftTail) 5780 IsSibCall = true; 5781 5782 if (IsTailCall) 5783 ++NumTailCalls; 5784 } 5785 5786 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) 5787 report_fatal_error("failed to perform tail call elimination on a call " 5788 "site marked musttail"); 5789 5790 // Analyze operands of the call, assigning locations to each operand. 5791 SmallVector<CCValAssign, 16> ArgLocs; 5792 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 5793 *DAG.getContext()); 5794 5795 if (IsVarArg) { 5796 // Handle fixed and variable vector arguments differently. 5797 // Variable vector arguments always go into memory. 5798 unsigned NumArgs = Outs.size(); 5799 5800 for (unsigned i = 0; i != NumArgs; ++i) { 5801 MVT ArgVT = Outs[i].VT; 5802 if (!Outs[i].IsFixed && ArgVT.isScalableVector()) 5803 report_fatal_error("Passing SVE types to variadic functions is " 5804 "currently not supported"); 5805 5806 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 5807 bool UseVarArgCC = !Outs[i].IsFixed; 5808 // On Windows, the fixed arguments in a vararg call are passed in GPRs 5809 // too, so use the vararg CC to force them to integer registers. 
5810 if (IsCalleeWin64) 5811 UseVarArgCC = true; 5812 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); 5813 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 5814 assert(!Res && "Call operand has unhandled type"); 5815 (void)Res; 5816 } 5817 } else { 5818 // At this point, Outs[].VT may already be promoted to i32. To correctly 5819 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 5820 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 5821 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 5822 // we use a special version of AnalyzeCallOperands to pass in ValVT and 5823 // LocVT. 5824 unsigned NumArgs = Outs.size(); 5825 for (unsigned i = 0; i != NumArgs; ++i) { 5826 MVT ValVT = Outs[i].VT; 5827 // Get type of the original argument. 5828 EVT ActualVT = getValueType(DAG.getDataLayout(), 5829 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 5830 /*AllowUnknown*/ true); 5831 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 5832 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 5833 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 5834 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 5835 ValVT = MVT::i8; 5836 else if (ActualMVT == MVT::i16) 5837 ValVT = MVT::i16; 5838 5839 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 5840 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 5841 assert(!Res && "Call operand has unhandled type"); 5842 (void)Res; 5843 } 5844 } 5845 5846 // Get a count of how many bytes are to be pushed on the stack. 5847 unsigned NumBytes = CCInfo.getNextStackOffset(); 5848 5849 if (IsSibCall) { 5850 // Since we're not changing the ABI to make this a tail call, the memory 5851 // operands are already available in the caller's incoming argument space. 5852 NumBytes = 0; 5853 } 5854 5855 // FPDiff is the byte offset of the call's argument area from the callee's. 5856 // Stores to callee stack arguments will be placed in FixedStackSlots offset 5857 // by this amount for a tail call. In a sibling call it must be 0 because the 5858 // caller will deallocate the entire stack and the callee still expects its 5859 // arguments to begin at SP+0. Completely unused for non-tail calls. 5860 int FPDiff = 0; 5861 5862 if (IsTailCall && !IsSibCall) { 5863 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 5864 5865 // Since callee will pop argument stack as a tail call, we must keep the 5866 // popped size 16-byte aligned. 5867 NumBytes = alignTo(NumBytes, 16); 5868 5869 // FPDiff will be negative if this tail call requires more space than we 5870 // would automatically have in our incoming argument space. Positive if we 5871 // can actually shrink the stack. 5872 FPDiff = NumReusableBytes - NumBytes; 5873 5874 // Update the required reserved area if this is the tail call requiring the 5875 // most argument stack space. 5876 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) 5877 FuncInfo->setTailCallReservedStack(-FPDiff); 5878 5879 // The stack pointer must be 16-byte aligned at all times it's used for a 5880 // memory operation, which in practice means at *all* times and in 5881 // particular across call boundaries. Therefore our own arguments started at 5882 // a 16-byte aligned SP and the delta applied for the tail call should 5883 // satisfy the same constraint. 
5884 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 5885 } 5886 5887 // Adjust the stack pointer for the new arguments... 5888 // These operations are automatically eliminated by the prolog/epilog pass 5889 if (!IsSibCall) 5890 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); 5891 5892 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 5893 getPointerTy(DAG.getDataLayout())); 5894 5895 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5896 SmallSet<unsigned, 8> RegsUsed; 5897 SmallVector<SDValue, 8> MemOpChains; 5898 auto PtrVT = getPointerTy(DAG.getDataLayout()); 5899 5900 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { 5901 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 5902 for (const auto &F : Forwards) { 5903 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); 5904 RegsToPass.emplace_back(F.PReg, Val); 5905 } 5906 } 5907 5908 // Walk the register/memloc assignments, inserting copies/loads. 5909 unsigned ExtraArgLocs = 0; 5910 for (unsigned i = 0, e = Outs.size(); i != e; ++i) { 5911 CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; 5912 SDValue Arg = OutVals[i]; 5913 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5914 5915 // Promote the value if needed. 5916 switch (VA.getLocInfo()) { 5917 default: 5918 llvm_unreachable("Unknown loc info!"); 5919 case CCValAssign::Full: 5920 break; 5921 case CCValAssign::SExt: 5922 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 5923 break; 5924 case CCValAssign::ZExt: 5925 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 5926 break; 5927 case CCValAssign::AExt: 5928 if (Outs[i].ArgVT == MVT::i1) { 5929 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 5930 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 5931 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 5932 } 5933 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 5934 break; 5935 case CCValAssign::AExtUpper: 5936 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 5937 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 5938 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 5939 DAG.getConstant(32, DL, VA.getLocVT())); 5940 break; 5941 case CCValAssign::BCvt: 5942 Arg = DAG.getBitcast(VA.getLocVT(), Arg); 5943 break; 5944 case CCValAssign::Trunc: 5945 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 5946 break; 5947 case CCValAssign::FPExt: 5948 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 5949 break; 5950 case CCValAssign::Indirect: 5951 assert(VA.getValVT().isScalableVector() && 5952 "Only scalable vectors can be passed indirectly"); 5953 5954 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize(); 5955 uint64_t PartSize = StoreSize; 5956 unsigned NumParts = 1; 5957 if (Outs[i].Flags.isInConsecutiveRegs()) { 5958 assert(!Outs[i].Flags.isInConsecutiveRegsLast()); 5959 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) 5960 ++NumParts; 5961 StoreSize *= NumParts; 5962 } 5963 5964 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 5965 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); 5966 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); 5967 int FI = MFI.CreateStackObject(StoreSize, Alignment, false); 5968 MFI.setStackID(FI, TargetStackID::ScalableVector); 5969 5970 MachinePointerInfo MPI = 5971 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 5972 SDValue Ptr = DAG.getFrameIndex( 5973 FI, 
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 5974 SDValue SpillSlot = Ptr; 5975 5976 // Ensure we generate all stores for each tuple part, whilst updating the 5977 // pointer after each store correctly using vscale. 5978 while (NumParts) { 5979 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); 5980 NumParts--; 5981 if (NumParts > 0) { 5982 SDValue BytesIncrement = DAG.getVScale( 5983 DL, Ptr.getValueType(), 5984 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); 5985 SDNodeFlags Flags; 5986 Flags.setNoUnsignedWrap(true); 5987 5988 MPI = MachinePointerInfo(MPI.getAddrSpace()); 5989 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 5990 BytesIncrement, Flags); 5991 ExtraArgLocs++; 5992 i++; 5993 } 5994 } 5995 5996 Arg = SpillSlot; 5997 break; 5998 } 5999 6000 if (VA.isRegLoc()) { 6001 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 6002 Outs[0].VT == MVT::i64) { 6003 assert(VA.getLocVT() == MVT::i64 && 6004 "unexpected calling convention register assignment"); 6005 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 6006 "unexpected use of 'returned'"); 6007 IsThisReturn = true; 6008 } 6009 if (RegsUsed.count(VA.getLocReg())) { 6010 // If this register has already been used then we're trying to pack 6011 // parts of an [N x i32] into an X-register. The extension type will 6012 // take care of putting the two halves in the right place but we have to 6013 // combine them. 6014 SDValue &Bits = 6015 llvm::find_if(RegsToPass, 6016 [=](const std::pair<unsigned, SDValue> &Elt) { 6017 return Elt.first == VA.getLocReg(); 6018 }) 6019 ->second; 6020 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 6021 // Call site info is used for function's parameter entry value 6022 // tracking. For now we track only simple cases when parameter 6023 // is transferred through whole register. 6024 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { 6025 return ArgReg.Reg == VA.getLocReg(); 6026 }); 6027 } else { 6028 RegsToPass.emplace_back(VA.getLocReg(), Arg); 6029 RegsUsed.insert(VA.getLocReg()); 6030 const TargetOptions &Options = DAG.getTarget().Options; 6031 if (Options.EmitCallSiteInfo) 6032 CSInfo.emplace_back(VA.getLocReg(), i); 6033 } 6034 } else { 6035 assert(VA.isMemLoc()); 6036 6037 SDValue DstAddr; 6038 MachinePointerInfo DstInfo; 6039 6040 // FIXME: This works on big-endian for composite byvals, which are the 6041 // common case. It should also work for fundamental types too. 6042 uint32_t BEAlign = 0; 6043 unsigned OpSize; 6044 if (VA.getLocInfo() == CCValAssign::Indirect || 6045 VA.getLocInfo() == CCValAssign::Trunc) 6046 OpSize = VA.getLocVT().getFixedSizeInBits(); 6047 else 6048 OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 6049 : VA.getValVT().getSizeInBits(); 6050 OpSize = (OpSize + 7) / 8; 6051 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 6052 !Flags.isInConsecutiveRegs()) { 6053 if (OpSize < 8) 6054 BEAlign = 8 - OpSize; 6055 } 6056 unsigned LocMemOffset = VA.getLocMemOffset(); 6057 int32_t Offset = LocMemOffset + BEAlign; 6058 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 6059 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 6060 6061 if (IsTailCall) { 6062 Offset = Offset + FPDiff; 6063 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 6064 6065 DstAddr = DAG.getFrameIndex(FI, PtrVT); 6066 DstInfo = 6067 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 6068 6069 // Make sure any stack arguments overlapping with where we're storing 6070 // are loaded before this eventual operation. Otherwise they'll be 6071 // clobbered. 6072 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 6073 } else { 6074 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 6075 6076 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 6077 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 6078 LocMemOffset); 6079 } 6080 6081 if (Outs[i].Flags.isByVal()) { 6082 SDValue SizeNode = 6083 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 6084 SDValue Cpy = DAG.getMemcpy( 6085 Chain, DL, DstAddr, Arg, SizeNode, 6086 Outs[i].Flags.getNonZeroByValAlign(), 6087 /*isVol = */ false, /*AlwaysInline = */ false, 6088 /*isTailCall = */ false, DstInfo, MachinePointerInfo()); 6089 6090 MemOpChains.push_back(Cpy); 6091 } else { 6092 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 6093 // promoted to a legal register type i32, we should truncate Arg back to 6094 // i1/i8/i16. 6095 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 6096 VA.getValVT() == MVT::i16) 6097 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 6098 6099 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 6100 MemOpChains.push_back(Store); 6101 } 6102 } 6103 } 6104 6105 if (!MemOpChains.empty()) 6106 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 6107 6108 // Build a sequence of copy-to-reg nodes chained together with token chain 6109 // and flag operands which copy the outgoing args into the appropriate regs. 6110 SDValue InFlag; 6111 for (auto &RegToPass : RegsToPass) { 6112 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 6113 RegToPass.second, InFlag); 6114 InFlag = Chain.getValue(1); 6115 } 6116 6117 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 6118 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 6119 // node so that legalize doesn't hack it. 
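// Depending on how classifyGlobalFunctionReference() classifies the callee,
// the address below is either materialized directly (a TargetGlobalAddress /
// TargetExternalSymbol with no flags) or, for MO_GOT callees and for external
// symbols under the Mach-O large code model, loaded through the GOT with
// AArch64ISD::LOADgot.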
6120 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6121 auto GV = G->getGlobal();
6122 unsigned OpFlags =
6123 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
6124 if (OpFlags & AArch64II::MO_GOT) {
6125 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
6126 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6127 } else {
6128 const GlobalValue *GV = G->getGlobal();
6129 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
6130 }
6131 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
6132 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6133 Subtarget->isTargetMachO()) {
6134 const char *Sym = S->getSymbol();
6135 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
6136 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6137 } else {
6138 const char *Sym = S->getSymbol();
6139 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
6140 }
6141 }
6142
6143 // We don't usually want to end the call-sequence here because we would tidy
6144 // the frame up *after* the call, however in the ABI-changing tail-call case
6145 // we've carefully laid out the parameters so that when sp is reset they'll be
6146 // in the correct location.
6147 if (IsTailCall && !IsSibCall) {
6148 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
6149 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
6150 InFlag = Chain.getValue(1);
6151 }
6152
6153 std::vector<SDValue> Ops;
6154 Ops.push_back(Chain);
6155 Ops.push_back(Callee);
6156
6157 if (IsTailCall) {
6158 // Each tail call may have to adjust the stack by a different amount, so
6159 // this information must travel along with the operation for eventual
6160 // consumption by emitEpilogue.
6161 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
6162 }
6163
6164 // Add argument registers to the end of the list so that they are known live
6165 // into the call.
6166 for (auto &RegToPass : RegsToPass)
6167 Ops.push_back(DAG.getRegister(RegToPass.first,
6168 RegToPass.second.getValueType()));
6169
6170 // Add a register mask operand representing the call-preserved registers.
6171 const uint32_t *Mask;
6172 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6173 if (IsThisReturn) {
6174 // For 'this' returns, use the X0-preserving mask if applicable
6175 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
6176 if (!Mask) {
6177 IsThisReturn = false;
6178 Mask = TRI->getCallPreservedMask(MF, CallConv);
6179 }
6180 } else
6181 Mask = TRI->getCallPreservedMask(MF, CallConv);
6182
6183 if (Subtarget->hasCustomCallingConv())
6184 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
6185
6186 if (TRI->isAnyArgRegReserved(MF))
6187 TRI->emitReservedArgRegCallError(MF);
6188
6189 assert(Mask && "Missing call preserved mask for calling convention");
6190 Ops.push_back(DAG.getRegisterMask(Mask));
6191
6192 if (InFlag.getNode())
6193 Ops.push_back(InFlag);
6194
6195 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6196
6197 // If we're doing a tail call, use a TC_RETURN here rather than an
6198 // actual call instruction.
6199 if (IsTailCall) {
6200 MF.getFrameInfo().setHasTailCall();
6201 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
6202 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
6203 return Ret;
6204 }
6205
6206 unsigned CallOpc = AArch64ISD::CALL;
6207 // Calls with operand bundle "clang.arc.attachedcall" are special.
They should 6208 // be expanded to the call, directly followed by a special marker sequence. 6209 // Use the CALL_RVMARKER to do that. 6210 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { 6211 assert(!IsTailCall && 6212 "tail calls cannot be marked with clang.arc.attachedcall"); 6213 CallOpc = AArch64ISD::CALL_RVMARKER; 6214 } 6215 6216 // Returns a chain and a flag for retval copy to use. 6217 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); 6218 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 6219 InFlag = Chain.getValue(1); 6220 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 6221 6222 uint64_t CalleePopBytes = 6223 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; 6224 6225 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 6226 DAG.getIntPtrConstant(CalleePopBytes, DL, true), 6227 InFlag, DL); 6228 if (!Ins.empty()) 6229 InFlag = Chain.getValue(1); 6230 6231 // Handle result values, copying them out of physregs into vregs that we 6232 // return. 6233 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 6234 InVals, IsThisReturn, 6235 IsThisReturn ? OutVals[0] : SDValue()); 6236 } 6237 6238 bool AArch64TargetLowering::CanLowerReturn( 6239 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 6240 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 6241 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 6242 SmallVector<CCValAssign, 16> RVLocs; 6243 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 6244 return CCInfo.CheckReturn(Outs, RetCC); 6245 } 6246 6247 SDValue 6248 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 6249 bool isVarArg, 6250 const SmallVectorImpl<ISD::OutputArg> &Outs, 6251 const SmallVectorImpl<SDValue> &OutVals, 6252 const SDLoc &DL, SelectionDAG &DAG) const { 6253 auto &MF = DAG.getMachineFunction(); 6254 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 6255 6256 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 6257 SmallVector<CCValAssign, 16> RVLocs; 6258 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 6259 *DAG.getContext()); 6260 CCInfo.AnalyzeReturn(Outs, RetCC); 6261 6262 // Copy the result values into the output registers. 6263 SDValue Flag; 6264 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; 6265 SmallSet<unsigned, 4> RegsUsed; 6266 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 6267 ++i, ++realRVLocIdx) { 6268 CCValAssign &VA = RVLocs[i]; 6269 assert(VA.isRegLoc() && "Can only return in registers!"); 6270 SDValue Arg = OutVals[realRVLocIdx]; 6271 6272 switch (VA.getLocInfo()) { 6273 default: 6274 llvm_unreachable("Unknown loc info!"); 6275 case CCValAssign::Full: 6276 if (Outs[i].ArgVT == MVT::i1) { 6277 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 6278 // value. This is strictly redundant on Darwin (which uses "zeroext 6279 // i1"), but will be optimised out before ISel. 
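// Truncating to i1 first and then zero-extending guarantees that only bit 0
// of the original value survives in the returned register.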
6280 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 6281 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 6282 } 6283 break; 6284 case CCValAssign::BCvt: 6285 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 6286 break; 6287 case CCValAssign::AExt: 6288 case CCValAssign::ZExt: 6289 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 6290 break; 6291 case CCValAssign::AExtUpper: 6292 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 6293 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 6294 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 6295 DAG.getConstant(32, DL, VA.getLocVT())); 6296 break; 6297 } 6298 6299 if (RegsUsed.count(VA.getLocReg())) { 6300 SDValue &Bits = 6301 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { 6302 return Elt.first == VA.getLocReg(); 6303 })->second; 6304 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 6305 } else { 6306 RetVals.emplace_back(VA.getLocReg(), Arg); 6307 RegsUsed.insert(VA.getLocReg()); 6308 } 6309 } 6310 6311 SmallVector<SDValue, 4> RetOps(1, Chain); 6312 for (auto &RetVal : RetVals) { 6313 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); 6314 Flag = Chain.getValue(1); 6315 RetOps.push_back( 6316 DAG.getRegister(RetVal.first, RetVal.second.getValueType())); 6317 } 6318 6319 // Windows AArch64 ABIs require that for returning structs by value we copy 6320 // the sret argument into X0 for the return. 6321 // We saved the argument into a virtual register in the entry block, 6322 // so now we copy the value out and into X0. 6323 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 6324 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, 6325 getPointerTy(MF.getDataLayout())); 6326 6327 unsigned RetValReg = AArch64::X0; 6328 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); 6329 Flag = Chain.getValue(1); 6330 6331 RetOps.push_back( 6332 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 6333 } 6334 6335 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 6336 const MCPhysReg *I = 6337 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 6338 if (I) { 6339 for (; *I; ++I) { 6340 if (AArch64::GPR64RegClass.contains(*I)) 6341 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 6342 else if (AArch64::FPR64RegClass.contains(*I)) 6343 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 6344 else 6345 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 6346 } 6347 } 6348 6349 RetOps[0] = Chain; // Update chain. 6350 6351 // Add the flag if we have it. 
6352 if (Flag.getNode()) 6353 RetOps.push_back(Flag); 6354 6355 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); 6356 } 6357 6358 //===----------------------------------------------------------------------===// 6359 // Other Lowering Code 6360 //===----------------------------------------------------------------------===// 6361 6362 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, 6363 SelectionDAG &DAG, 6364 unsigned Flag) const { 6365 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 6366 N->getOffset(), Flag); 6367 } 6368 6369 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, 6370 SelectionDAG &DAG, 6371 unsigned Flag) const { 6372 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); 6373 } 6374 6375 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, 6376 SelectionDAG &DAG, 6377 unsigned Flag) const { 6378 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), 6379 N->getOffset(), Flag); 6380 } 6381 6382 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, 6383 SelectionDAG &DAG, 6384 unsigned Flag) const { 6385 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); 6386 } 6387 6388 // (loadGOT sym) 6389 template <class NodeTy> 6390 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, 6391 unsigned Flags) const { 6392 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); 6393 SDLoc DL(N); 6394 EVT Ty = getPointerTy(DAG.getDataLayout()); 6395 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); 6396 // FIXME: Once remat is capable of dealing with instructions with register 6397 // operands, expand this into two nodes instead of using a wrapper node. 6398 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); 6399 } 6400 6401 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) 6402 template <class NodeTy> 6403 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, 6404 unsigned Flags) const { 6405 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); 6406 SDLoc DL(N); 6407 EVT Ty = getPointerTy(DAG.getDataLayout()); 6408 const unsigned char MO_NC = AArch64II::MO_NC; 6409 return DAG.getNode( 6410 AArch64ISD::WrapperLarge, DL, Ty, 6411 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), 6412 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), 6413 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), 6414 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); 6415 } 6416 6417 // (addlow (adrp %hi(sym)) %lo(sym)) 6418 template <class NodeTy> 6419 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, 6420 unsigned Flags) const { 6421 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); 6422 SDLoc DL(N); 6423 EVT Ty = getPointerTy(DAG.getDataLayout()); 6424 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); 6425 SDValue Lo = getTargetNode(N, Ty, DAG, 6426 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); 6427 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); 6428 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); 6429 } 6430 6431 // (adr sym) 6432 template <class NodeTy> 6433 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, 6434 unsigned Flags) const { 6435 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); 6436 SDLoc DL(N); 6437 EVT Ty = getPointerTy(DAG.getDataLayout()); 6438 SDValue Sym = getTargetNode(N, Ty, DAG, Flags); 6439 return 
DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); 6440 } 6441 6442 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, 6443 SelectionDAG &DAG) const { 6444 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 6445 const GlobalValue *GV = GN->getGlobal(); 6446 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6447 6448 if (OpFlags != AArch64II::MO_NO_FLAG) 6449 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && 6450 "unexpected offset in global node"); 6451 6452 // This also catches the large code model case for Darwin, and tiny code 6453 // model with got relocations. 6454 if ((OpFlags & AArch64II::MO_GOT) != 0) { 6455 return getGOT(GN, DAG, OpFlags); 6456 } 6457 6458 SDValue Result; 6459 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 6460 Result = getAddrLarge(GN, DAG, OpFlags); 6461 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 6462 Result = getAddrTiny(GN, DAG, OpFlags); 6463 } else { 6464 Result = getAddr(GN, DAG, OpFlags); 6465 } 6466 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6467 SDLoc DL(GN); 6468 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) 6469 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 6470 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 6471 return Result; 6472 } 6473 6474 /// Convert a TLS address reference into the correct sequence of loads 6475 /// and calls to compute the variable's address (for Darwin, currently) and 6476 /// return an SDValue containing the final node. 6477 6478 /// Darwin only has one TLS scheme which must be capable of dealing with the 6479 /// fully general situation, in the worst case. This means: 6480 /// + "extern __thread" declaration. 6481 /// + Defined in a possibly unknown dynamic library. 6482 /// 6483 /// The general system is that each __thread variable has a [3 x i64] descriptor 6484 /// which contains information used by the runtime to calculate the address. The 6485 /// only part of this the compiler needs to know about is the first xword, which 6486 /// contains a function pointer that must be called with the address of the 6487 /// entire descriptor in "x0". 6488 /// 6489 /// Since this descriptor may be in a different unit, in general even the 6490 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 6491 /// is: 6492 /// adrp x0, _var@TLVPPAGE 6493 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 6494 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 6495 /// ; the function pointer 6496 /// blr x1 ; Uses descriptor address in x0 6497 /// ; Address of _var is now in x0. 6498 /// 6499 /// If the address of _var's descriptor *is* known to the linker, then it can 6500 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 6501 /// a slight efficiency gain. 
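/// The lowering below models this directly: the descriptor address is
/// materialized with a GOT-style load (AArch64ISD::LOADgot), the first entry
/// is loaded to obtain the resolver function pointer, and the call is emitted
/// with the special TLS call-preserved register mask so that only X0, LR and
/// NZCV are treated as clobbered.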
6502 SDValue 6503 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 6504 SelectionDAG &DAG) const { 6505 assert(Subtarget->isTargetDarwin() && 6506 "This function expects a Darwin target"); 6507 6508 SDLoc DL(Op); 6509 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 6510 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 6511 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6512 6513 SDValue TLVPAddr = 6514 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 6515 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 6516 6517 // The first entry in the descriptor is a function pointer that we must call 6518 // to obtain the address of the variable. 6519 SDValue Chain = DAG.getEntryNode(); 6520 SDValue FuncTLVGet = DAG.getLoad( 6521 PtrMemVT, DL, Chain, DescAddr, 6522 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 6523 Align(PtrMemVT.getSizeInBits() / 8), 6524 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); 6525 Chain = FuncTLVGet.getValue(1); 6526 6527 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. 6528 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); 6529 6530 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 6531 MFI.setAdjustsStack(true); 6532 6533 // TLS calls preserve all registers except those that absolutely must be 6534 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 6535 // silly). 6536 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 6537 const uint32_t *Mask = TRI->getTLSCallPreservedMask(); 6538 if (Subtarget->hasCustomCallingConv()) 6539 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 6540 6541 // Finally, we can make the call. This is just a degenerate version of a 6542 // normal AArch64 call node: x0 takes the address of the descriptor, and 6543 // returns the address of the variable in this thread. 6544 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 6545 Chain = 6546 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 6547 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 6548 DAG.getRegisterMask(Mask), Chain.getValue(1)); 6549 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 6550 } 6551 6552 /// Convert a thread-local variable reference into a sequence of instructions to 6553 /// compute the variable's address for the local exec TLS model of ELF targets. 6554 /// The sequence depends on the maximum TLS area size. 
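/// For TLSSize values of 12, 24, 32 and 48 bits the emitted sequence can reach
/// offsets of up to 4KiB, 16MiB, 4GiB and 256TiB from TPIDR_EL0 respectively;
/// the per-case comments below show the exact instruction sequences.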
6555 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, 6556 SDValue ThreadBase, 6557 const SDLoc &DL, 6558 SelectionDAG &DAG) const { 6559 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6560 SDValue TPOff, Addr; 6561 6562 switch (DAG.getTarget().Options.TLSSize) { 6563 default: 6564 llvm_unreachable("Unexpected TLS size"); 6565 6566 case 12: { 6567 // mrs x0, TPIDR_EL0 6568 // add x0, x0, :tprel_lo12:a 6569 SDValue Var = DAG.getTargetGlobalAddress( 6570 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); 6571 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 6572 Var, 6573 DAG.getTargetConstant(0, DL, MVT::i32)), 6574 0); 6575 } 6576 6577 case 24: { 6578 // mrs x0, TPIDR_EL0 6579 // add x0, x0, :tprel_hi12:a 6580 // add x0, x0, :tprel_lo12_nc:a 6581 SDValue HiVar = DAG.getTargetGlobalAddress( 6582 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 6583 SDValue LoVar = DAG.getTargetGlobalAddress( 6584 GV, DL, PtrVT, 0, 6585 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 6586 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 6587 HiVar, 6588 DAG.getTargetConstant(0, DL, MVT::i32)), 6589 0); 6590 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, 6591 LoVar, 6592 DAG.getTargetConstant(0, DL, MVT::i32)), 6593 0); 6594 } 6595 6596 case 32: { 6597 // mrs x1, TPIDR_EL0 6598 // movz x0, #:tprel_g1:a 6599 // movk x0, #:tprel_g0_nc:a 6600 // add x0, x1, x0 6601 SDValue HiVar = DAG.getTargetGlobalAddress( 6602 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); 6603 SDValue LoVar = DAG.getTargetGlobalAddress( 6604 GV, DL, PtrVT, 0, 6605 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 6606 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 6607 DAG.getTargetConstant(16, DL, MVT::i32)), 6608 0); 6609 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 6610 DAG.getTargetConstant(0, DL, MVT::i32)), 6611 0); 6612 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 6613 } 6614 6615 case 48: { 6616 // mrs x1, TPIDR_EL0 6617 // movz x0, #:tprel_g2:a 6618 // movk x0, #:tprel_g1_nc:a 6619 // movk x0, #:tprel_g0_nc:a 6620 // add x0, x1, x0 6621 SDValue HiVar = DAG.getTargetGlobalAddress( 6622 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); 6623 SDValue MiVar = DAG.getTargetGlobalAddress( 6624 GV, DL, PtrVT, 0, 6625 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); 6626 SDValue LoVar = DAG.getTargetGlobalAddress( 6627 GV, DL, PtrVT, 0, 6628 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 6629 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 6630 DAG.getTargetConstant(32, DL, MVT::i32)), 6631 0); 6632 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, 6633 DAG.getTargetConstant(16, DL, MVT::i32)), 6634 0); 6635 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 6636 DAG.getTargetConstant(0, DL, MVT::i32)), 6637 0); 6638 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 6639 } 6640 } 6641 } 6642 6643 /// When accessing thread-local variables under either the general-dynamic or 6644 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 6645 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 6646 /// is a function pointer to carry out the resolution. 
6647 /// 6648 /// The sequence is: 6649 /// adrp x0, :tlsdesc:var 6650 /// ldr x1, [x0, #:tlsdesc_lo12:var] 6651 /// add x0, x0, #:tlsdesc_lo12:var 6652 /// .tlsdesccall var 6653 /// blr x1 6654 /// (TPIDR_EL0 offset now in x0) 6655 /// 6656 /// The above sequence must be produced unscheduled, to enable the linker to 6657 /// optimize/relax this sequence. 6658 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 6659 /// above sequence, and expanded really late in the compilation flow, to ensure 6660 /// the sequence is produced as per above. 6661 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 6662 const SDLoc &DL, 6663 SelectionDAG &DAG) const { 6664 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6665 6666 SDValue Chain = DAG.getEntryNode(); 6667 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6668 6669 Chain = 6670 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 6671 SDValue Glue = Chain.getValue(1); 6672 6673 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 6674 } 6675 6676 SDValue 6677 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 6678 SelectionDAG &DAG) const { 6679 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 6680 6681 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6682 6683 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 6684 6685 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 6686 if (Model == TLSModel::LocalDynamic) 6687 Model = TLSModel::GeneralDynamic; 6688 } 6689 6690 if (getTargetMachine().getCodeModel() == CodeModel::Large && 6691 Model != TLSModel::LocalExec) 6692 report_fatal_error("ELF TLS only supported in small memory model or " 6693 "in local exec TLS model"); 6694 // Different choices can be made for the maximum size of the TLS area for a 6695 // module. For the small address model, the default TLS size is 16MiB and the 6696 // maximum TLS size is 4GiB. 6697 // FIXME: add tiny and large code model support for TLS access models other 6698 // than local exec. We currently generate the same code as small for tiny, 6699 // which may be larger than needed. 6700 6701 SDValue TPOff; 6702 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6703 SDLoc DL(Op); 6704 const GlobalValue *GV = GA->getGlobal(); 6705 6706 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 6707 6708 if (Model == TLSModel::LocalExec) { 6709 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); 6710 } else if (Model == TLSModel::InitialExec) { 6711 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 6712 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 6713 } else if (Model == TLSModel::LocalDynamic) { 6714 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 6715 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 6716 // the beginning of the module's TLS region, followed by a DTPREL offset 6717 // calculation. 6718 6719 // These accesses will need deduplicating if there's more than one. 6720 AArch64FunctionInfo *MFI = 6721 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 6722 MFI->incNumLocalDynamicTLSAccesses(); 6723 6724 // The call needs a relocation too for linker relaxation. It doesn't make 6725 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 6726 // the address. 
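// For local-dynamic TLS the descriptor call below resolves the special
// _TLS_MODULE_BASE_ symbol (the start of this module's TLS block); the
// variable's own offset within that block is then added with the
// :dtprel_hi12: / :dtprel_lo12_nc:-style ADDs further down.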
6727 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 6728 AArch64II::MO_TLS); 6729 6730 // Now we can calculate the offset from TPIDR_EL0 to this module's 6731 // thread-local area. 6732 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 6733 6734 // Now use :dtprel_whatever: operations to calculate this variable's offset 6735 // in its thread-storage area. 6736 SDValue HiVar = DAG.getTargetGlobalAddress( 6737 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 6738 SDValue LoVar = DAG.getTargetGlobalAddress( 6739 GV, DL, MVT::i64, 0, 6740 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 6741 6742 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 6743 DAG.getTargetConstant(0, DL, MVT::i32)), 6744 0); 6745 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 6746 DAG.getTargetConstant(0, DL, MVT::i32)), 6747 0); 6748 } else if (Model == TLSModel::GeneralDynamic) { 6749 // The call needs a relocation too for linker relaxation. It doesn't make 6750 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 6751 // the address. 6752 SDValue SymAddr = 6753 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 6754 6755 // Finally we can make a call to calculate the offset from tpidr_el0. 6756 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 6757 } else 6758 llvm_unreachable("Unsupported ELF TLS access model"); 6759 6760 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 6761 } 6762 6763 SDValue 6764 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, 6765 SelectionDAG &DAG) const { 6766 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 6767 6768 SDValue Chain = DAG.getEntryNode(); 6769 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6770 SDLoc DL(Op); 6771 6772 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); 6773 6774 // Load the ThreadLocalStoragePointer from the TEB 6775 // A pointer to the TLS array is located at offset 0x58 from the TEB. 6776 SDValue TLSArray = 6777 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); 6778 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 6779 Chain = TLSArray.getValue(1); 6780 6781 // Load the TLS index from the C runtime; 6782 // This does the same as getAddr(), but without having a GlobalAddressSDNode. 6783 // This also does the same as LOADgot, but using a generic i32 load, 6784 // while LOADgot only loads i64. 6785 SDValue TLSIndexHi = 6786 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); 6787 SDValue TLSIndexLo = DAG.getTargetExternalSymbol( 6788 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 6789 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); 6790 SDValue TLSIndex = 6791 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); 6792 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); 6793 Chain = TLSIndex.getValue(1); 6794 6795 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 6796 // offset into the TLSArray. 
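// That is, the per-thread data pointer is loaded from
// TLSArray + (_tls_index << 3); the variable's offset from the start of the
// .tls section is then added to it below.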
6797 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); 6798 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 6799 DAG.getConstant(3, DL, PtrVT)); 6800 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 6801 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 6802 MachinePointerInfo()); 6803 Chain = TLS.getValue(1); 6804 6805 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6806 const GlobalValue *GV = GA->getGlobal(); 6807 SDValue TGAHi = DAG.getTargetGlobalAddress( 6808 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 6809 SDValue TGALo = DAG.getTargetGlobalAddress( 6810 GV, DL, PtrVT, 0, 6811 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 6812 6813 // Add the offset from the start of the .tls section (section base). 6814 SDValue Addr = 6815 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, 6816 DAG.getTargetConstant(0, DL, MVT::i32)), 6817 0); 6818 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); 6819 return Addr; 6820 } 6821 6822 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 6823 SelectionDAG &DAG) const { 6824 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6825 if (DAG.getTarget().useEmulatedTLS()) 6826 return LowerToTLSEmulatedModel(GA, DAG); 6827 6828 if (Subtarget->isTargetDarwin()) 6829 return LowerDarwinGlobalTLSAddress(Op, DAG); 6830 if (Subtarget->isTargetELF()) 6831 return LowerELFGlobalTLSAddress(Op, DAG); 6832 if (Subtarget->isTargetWindows()) 6833 return LowerWindowsGlobalTLSAddress(Op, DAG); 6834 6835 llvm_unreachable("Unexpected platform trying to use TLS"); 6836 } 6837 6838 // Looks through \param Val to determine the bit that can be used to 6839 // check the sign of the value. It returns the unextended value and 6840 // the sign bit position. 6841 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { 6842 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) 6843 return {Val.getOperand(0), 6844 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - 6845 1}; 6846 6847 if (Val.getOpcode() == ISD::SIGN_EXTEND) 6848 return {Val.getOperand(0), 6849 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; 6850 6851 return {Val, Val.getValueSizeInBits() - 1}; 6852 } 6853 6854 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 6855 SDValue Chain = Op.getOperand(0); 6856 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 6857 SDValue LHS = Op.getOperand(2); 6858 SDValue RHS = Op.getOperand(3); 6859 SDValue Dest = Op.getOperand(4); 6860 SDLoc dl(Op); 6861 6862 MachineFunction &MF = DAG.getMachineFunction(); 6863 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 6864 // will not be produced, as they are conditional branch instructions that do 6865 // not set flags. 6866 bool ProduceNonFlagSettingCondBr = 6867 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 6868 6869 // Handle f128 first, since lowering it will result in comparing the return 6870 // value of a libcall against zero, which is just what the rest of LowerBR_CC 6871 // is expecting to deal with. 6872 if (LHS.getValueType() == MVT::f128) { 6873 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); 6874 6875 // If softenSetCCOperands returned a scalar, we need to compare the result 6876 // against zero to select between true and false values. 
6877 if (!RHS.getNode()) { 6878 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 6879 CC = ISD::SETNE; 6880 } 6881 } 6882 6883 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 6884 // instruction. 6885 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && 6886 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6887 // Only lower legal XALUO ops. 6888 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 6889 return SDValue(); 6890 6891 // The actual operation with overflow check. 6892 AArch64CC::CondCode OFCC; 6893 SDValue Value, Overflow; 6894 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 6895 6896 if (CC == ISD::SETNE) 6897 OFCC = getInvertedCondCode(OFCC); 6898 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 6899 6900 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 6901 Overflow); 6902 } 6903 6904 if (LHS.getValueType().isInteger()) { 6905 assert((LHS.getValueType() == RHS.getValueType()) && 6906 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 6907 6908 // If the RHS of the comparison is zero, we can potentially fold this 6909 // to a specialized branch. 6910 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 6911 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { 6912 if (CC == ISD::SETEQ) { 6913 // See if we can use a TBZ to fold in an AND as well. 6914 // TBZ has a smaller branch displacement than CBZ. If the offset is 6915 // out of bounds, a late MI-layer pass rewrites branches. 6916 // 403.gcc is an example that hits this case. 6917 if (LHS.getOpcode() == ISD::AND && 6918 isa<ConstantSDNode>(LHS.getOperand(1)) && 6919 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 6920 SDValue Test = LHS.getOperand(0); 6921 uint64_t Mask = LHS.getConstantOperandVal(1); 6922 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 6923 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 6924 Dest); 6925 } 6926 6927 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 6928 } else if (CC == ISD::SETNE) { 6929 // See if we can use a TBZ to fold in an AND as well. 6930 // TBZ has a smaller branch displacement than CBZ. If the offset is 6931 // out of bounds, a late MI-layer pass rewrites branches. 6932 // 403.gcc is an example that hits this case. 6933 if (LHS.getOpcode() == ISD::AND && 6934 isa<ConstantSDNode>(LHS.getOperand(1)) && 6935 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 6936 SDValue Test = LHS.getOperand(0); 6937 uint64_t Mask = LHS.getConstantOperandVal(1); 6938 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 6939 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 6940 Dest); 6941 } 6942 6943 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 6944 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 6945 // Don't combine AND since emitComparison converts the AND to an ANDS 6946 // (a.k.a. TST) and the test in the test bit and branch instruction 6947 // becomes redundant. This would also increase register pressure. 6948 uint64_t SignBitPos; 6949 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); 6950 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 6951 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); 6952 } 6953 } 6954 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 6955 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { 6956 // Don't combine AND since emitComparison converts the AND to an ANDS 6957 // (a.k.a. 
TST) and the test in the test bit and branch instruction 6958 // becomes redundant. This would also increase register pressure. 6959 uint64_t SignBitPos; 6960 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); 6961 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 6962 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); 6963 } 6964 6965 SDValue CCVal; 6966 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 6967 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 6968 Cmp); 6969 } 6970 6971 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || 6972 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 6973 6974 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 6975 // clean. Some of them require two branches to implement. 6976 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 6977 AArch64CC::CondCode CC1, CC2; 6978 changeFPCCToAArch64CC(CC, CC1, CC2); 6979 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 6980 SDValue BR1 = 6981 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 6982 if (CC2 != AArch64CC::AL) { 6983 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 6984 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 6985 Cmp); 6986 } 6987 6988 return BR1; 6989 } 6990 6991 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 6992 SelectionDAG &DAG) const { 6993 EVT VT = Op.getValueType(); 6994 SDLoc DL(Op); 6995 6996 SDValue In1 = Op.getOperand(0); 6997 SDValue In2 = Op.getOperand(1); 6998 EVT SrcVT = In2.getValueType(); 6999 7000 if (SrcVT.bitsLT(VT)) 7001 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 7002 else if (SrcVT.bitsGT(VT)) 7003 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 7004 7005 EVT VecVT; 7006 uint64_t EltMask; 7007 SDValue VecVal1, VecVal2; 7008 7009 auto setVecVal = [&] (int Idx) { 7010 if (!VT.isVector()) { 7011 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 7012 DAG.getUNDEF(VecVT), In1); 7013 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 7014 DAG.getUNDEF(VecVT), In2); 7015 } else { 7016 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 7017 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 7018 } 7019 }; 7020 7021 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 7022 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 7023 EltMask = 0x80000000ULL; 7024 setVecVal(AArch64::ssub); 7025 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 7026 VecVT = MVT::v2i64; 7027 7028 // We want to materialize a mask with the high bit set, but the AdvSIMD 7029 // immediate moves cannot materialize that in a single instruction for 7030 // 64-bit elements. Instead, materialize zero and then negate it. 7031 EltMask = 0; 7032 7033 setVecVal(AArch64::dsub); 7034 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { 7035 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); 7036 EltMask = 0x8000ULL; 7037 setVecVal(AArch64::hsub); 7038 } else { 7039 llvm_unreachable("Invalid type for copysign!"); 7040 } 7041 7042 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 7043 7044 // If we couldn't materialize the mask above, then the mask vector will be 7045 // the zero vector, and we need to negate it here. 
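// FNEG of +0.0 yields -0.0, i.e. a lane with only the sign bit
// (0x8000000000000000) set, which is exactly the mask the BIT instruction
// below needs for the 64-bit element case.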
7046 if (VT == MVT::f64 || VT == MVT::v2f64) { 7047 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 7048 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 7049 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 7050 } 7051 7052 SDValue Sel = 7053 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 7054 7055 if (VT == MVT::f16) 7056 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); 7057 if (VT == MVT::f32) 7058 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 7059 else if (VT == MVT::f64) 7060 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 7061 else 7062 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 7063 } 7064 7065 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 7066 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 7067 Attribute::NoImplicitFloat)) 7068 return SDValue(); 7069 7070 if (!Subtarget->hasNEON()) 7071 return SDValue(); 7072 7073 // While there is no integer popcount instruction, it can 7074 // be more efficiently lowered to the following sequence that uses 7075 // AdvSIMD registers/instructions as long as the copies to/from 7076 // the AdvSIMD registers are cheap. 7077 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 7078 // CNT V0.8B, V0.8B // 8xbyte pop-counts 7079 // ADDV B0, V0.8B // sum 8xbyte pop-counts 7080 // UMOV X0, V0.B[0] // copy byte result back to integer reg 7081 SDValue Val = Op.getOperand(0); 7082 SDLoc DL(Op); 7083 EVT VT = Op.getValueType(); 7084 7085 if (VT == MVT::i32 || VT == MVT::i64) { 7086 if (VT == MVT::i32) 7087 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 7088 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 7089 7090 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 7091 SDValue UaddLV = DAG.getNode( 7092 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 7093 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 7094 7095 if (VT == MVT::i64) 7096 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 7097 return UaddLV; 7098 } else if (VT == MVT::i128) { 7099 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); 7100 7101 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); 7102 SDValue UaddLV = DAG.getNode( 7103 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 7104 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 7105 7106 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); 7107 } 7108 7109 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) 7110 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); 7111 7112 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 7113 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 7114 "Unexpected type for custom ctpop lowering"); 7115 7116 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 7117 Val = DAG.getBitcast(VT8Bit, Val); 7118 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); 7119 7120 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 7121 unsigned EltSize = 8; 7122 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 7123 while (EltSize != VT.getScalarSizeInBits()) { 7124 EltSize *= 2; 7125 NumElts /= 2; 7126 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 7127 Val = DAG.getNode( 7128 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, 7129 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); 7130 } 7131 7132 return Val; 7133 } 7134 7135 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7136 EVT VT = Op.getValueType(); 7137 assert(VT.isScalableVector() || 7138 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); 7139 7140 SDLoc DL(Op); 7141 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); 7142 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); 7143 } 7144 7145 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, 7146 SelectionDAG &DAG) const { 7147 EVT VT = Op.getValueType(); 7148 7149 if (VT.isScalableVector() || 7150 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) 7151 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU, 7152 true); 7153 7154 SDLoc DL(Op); 7155 SDValue REVB; 7156 MVT VST; 7157 7158 switch (VT.getSimpleVT().SimpleTy) { 7159 default: 7160 llvm_unreachable("Invalid type for bitreverse!"); 7161 7162 case MVT::v2i32: { 7163 VST = MVT::v8i8; 7164 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); 7165 7166 break; 7167 } 7168 7169 case MVT::v4i32: { 7170 VST = MVT::v16i8; 7171 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); 7172 7173 break; 7174 } 7175 7176 case MVT::v1i64: { 7177 VST = MVT::v8i8; 7178 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); 7179 7180 break; 7181 } 7182 7183 case MVT::v2i64: { 7184 VST = MVT::v16i8; 7185 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); 7186 7187 break; 7188 } 7189 } 7190 7191 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, 7192 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB)); 7193 } 7194 7195 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7196 7197 if (Op.getValueType().isVector()) 7198 return LowerVSETCC(Op, DAG); 7199 7200 bool IsStrict = Op->isStrictFPOpcode(); 7201 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 7202 unsigned OpNo = IsStrict ? 1 : 0; 7203 SDValue Chain; 7204 if (IsStrict) 7205 Chain = Op.getOperand(0); 7206 SDValue LHS = Op.getOperand(OpNo + 0); 7207 SDValue RHS = Op.getOperand(OpNo + 1); 7208 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); 7209 SDLoc dl(Op); 7210 7211 // We chose ZeroOrOneBooleanContents, so use zero and one. 7212 EVT VT = Op.getValueType(); 7213 SDValue TVal = DAG.getConstant(1, dl, VT); 7214 SDValue FVal = DAG.getConstant(0, dl, VT); 7215 7216 // Handle f128 first, since one possible outcome is a normal integer 7217 // comparison which gets picked up by the next if statement. 7218 if (LHS.getValueType() == MVT::f128) { 7219 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, 7220 IsSignaling); 7221 7222 // If softenSetCCOperands returned a scalar, use it. 7223 if (!RHS.getNode()) { 7224 assert(LHS.getValueType() == Op.getValueType() && 7225 "Unexpected setcc expansion!"); 7226 return IsStrict ? 
DAG.getMergeValues({LHS, Chain}, dl) : LHS; 7227 } 7228 } 7229 7230 if (LHS.getValueType().isInteger()) { 7231 SDValue CCVal; 7232 SDValue Cmp = getAArch64Cmp( 7233 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); 7234 7235 // Note that we inverted the condition above, so we reverse the order of 7236 // the true and false operands here. This will allow the setcc to be 7237 // matched to a single CSINC instruction. 7238 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 7239 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; 7240 } 7241 7242 // Now we know we're dealing with FP values. 7243 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 7244 LHS.getValueType() == MVT::f64); 7245 7246 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead 7247 // and do the comparison. 7248 SDValue Cmp; 7249 if (IsStrict) 7250 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); 7251 else 7252 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 7253 7254 AArch64CC::CondCode CC1, CC2; 7255 changeFPCCToAArch64CC(CC, CC1, CC2); 7256 SDValue Res; 7257 if (CC2 == AArch64CC::AL) { 7258 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, 7259 CC2); 7260 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 7261 7262 // Note that we inverted the condition above, so we reverse the order of 7263 // the true and false operands here. This will allow the setcc to be 7264 // matched to a single CSINC instruction. 7265 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); 7266 } else { 7267 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 7268 // totally clean. Some of them require two CSELs to implement. As is in 7269 // this case, we emit the first CSEL and then emit a second using the output 7270 // of the first as the RHS. We're effectively OR'ing the two CC's together. 7271 7272 // FIXME: It would be nice if we could match the two CSELs to two CSINCs. 7273 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 7274 SDValue CS1 = 7275 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 7276 7277 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 7278 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 7279 } 7280 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; 7281 } 7282 7283 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, 7284 SDValue RHS, SDValue TVal, 7285 SDValue FVal, const SDLoc &dl, 7286 SelectionDAG &DAG) const { 7287 // Handle f128 first, because it will result in a comparison of some RTLIB 7288 // call result against zero. 7289 if (LHS.getValueType() == MVT::f128) { 7290 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); 7291 7292 // If softenSetCCOperands returned a scalar, we need to compare the result 7293 // against zero to select between true and false values. 7294 if (!RHS.getNode()) { 7295 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 7296 CC = ISD::SETNE; 7297 } 7298 } 7299 7300 // Also handle f16, for which we need to do a f32 comparison. 7301 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { 7302 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 7303 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 7304 } 7305 7306 // Next, handle integers. 
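  // Integer selects become a compare followed by one instruction from the CSEL
  // family (CSEL/CSINC/CSINV/CSNEG); constant true/false arms are rearranged
  // below so that, where possible, no constant has to be materialized at all.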
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
    // supported types.
    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
        CTVal->isOne() && CFVal->isAllOnesValue() &&
        LHS.getValueType() == TVal.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
    }

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
                 TrueVal == -FalseVal) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
        // 64-bit check whether we can use CSINC.
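        // (For 64-bit operands the comparison below already happens at the
        // operand width, so no extra wrap-around handling is required.)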
7385 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 7386 Opcode = AArch64ISD::CSINC; 7387 7388 if (TrueVal > FalseVal) { 7389 Swap = true; 7390 } 7391 } 7392 7393 // Swap TVal and FVal if necessary. 7394 if (Swap) { 7395 std::swap(TVal, FVal); 7396 std::swap(CTVal, CFVal); 7397 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 7398 } 7399 7400 if (Opcode != AArch64ISD::CSEL) { 7401 // Drop FVal since we can get its value by simply inverting/negating 7402 // TVal. 7403 FVal = TVal; 7404 } 7405 } 7406 7407 // Avoid materializing a constant when possible by reusing a known value in 7408 // a register. However, don't perform this optimization if the known value 7409 // is one, zero or negative one in the case of a CSEL. We can always 7410 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the 7411 // FVal, respectively. 7412 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); 7413 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && 7414 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { 7415 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 7416 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to 7417 // "a != C ? x : a" to avoid materializing C. 7418 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) 7419 TVal = LHS; 7420 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) 7421 FVal = LHS; 7422 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { 7423 assert (CTVal && CFVal && "Expected constant operands for CSNEG."); 7424 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to 7425 // avoid materializing C. 7426 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 7427 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { 7428 Opcode = AArch64ISD::CSINV; 7429 TVal = LHS; 7430 FVal = DAG.getConstant(0, dl, FVal.getValueType()); 7431 } 7432 } 7433 7434 SDValue CCVal; 7435 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 7436 EVT VT = TVal.getValueType(); 7437 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 7438 } 7439 7440 // Now we know we're dealing with FP values. 7441 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 7442 LHS.getValueType() == MVT::f64); 7443 assert(LHS.getValueType() == RHS.getValueType()); 7444 EVT VT = TVal.getValueType(); 7445 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 7446 7447 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 7448 // clean. Some of them require two CSELs to implement. 7449 AArch64CC::CondCode CC1, CC2; 7450 changeFPCCToAArch64CC(CC, CC1, CC2); 7451 7452 if (DAG.getTarget().Options.UnsafeFPMath) { 7453 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and 7454 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. 7455 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); 7456 if (RHSVal && RHSVal->isZero()) { 7457 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); 7458 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); 7459 7460 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && 7461 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) 7462 TVal = LHS; 7463 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && 7464 CFVal && CFVal->isZero() && 7465 FVal.getValueType() == LHS.getValueType()) 7466 FVal = LHS; 7467 } 7468 } 7469 7470 // Emit first, and possibly only, CSEL. 
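  // (Conditions such as SETONE or SETUEQ have no single AArch64 condition-code
  // equivalent; for those changeFPCCToAArch64CC sets CC2 and the second CSEL
  // below is required.)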
7471 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 7472 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 7473 7474 // If we need a second CSEL, emit it, using the output of the first as the 7475 // RHS. We're effectively OR'ing the two CC's together. 7476 if (CC2 != AArch64CC::AL) { 7477 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 7478 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 7479 } 7480 7481 // Otherwise, return the output of the first CSEL. 7482 return CS1; 7483 } 7484 7485 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, 7486 SelectionDAG &DAG) const { 7487 7488 EVT Ty = Op.getValueType(); 7489 auto Idx = Op.getConstantOperandAPInt(2); 7490 if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements())) 7491 return Op; 7492 return SDValue(); 7493 } 7494 7495 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 7496 SelectionDAG &DAG) const { 7497 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 7498 SDValue LHS = Op.getOperand(0); 7499 SDValue RHS = Op.getOperand(1); 7500 SDValue TVal = Op.getOperand(2); 7501 SDValue FVal = Op.getOperand(3); 7502 SDLoc DL(Op); 7503 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 7504 } 7505 7506 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 7507 SelectionDAG &DAG) const { 7508 SDValue CCVal = Op->getOperand(0); 7509 SDValue TVal = Op->getOperand(1); 7510 SDValue FVal = Op->getOperand(2); 7511 SDLoc DL(Op); 7512 7513 EVT Ty = Op.getValueType(); 7514 if (Ty.isScalableVector()) { 7515 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); 7516 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); 7517 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); 7518 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); 7519 } 7520 7521 if (useSVEForFixedLengthVectorVT(Ty)) { 7522 // FIXME: Ideally this would be the same as above using i1 types, however 7523 // for the moment we can't deal with fixed i1 vector types properly, so 7524 // instead extend the predicate to a result type sized integer vector. 7525 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); 7526 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); 7527 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); 7528 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); 7529 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); 7530 } 7531 7532 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 7533 // instruction. 7534 if (ISD::isOverflowIntrOpRes(CCVal)) { 7535 // Only lower legal XALUO ops. 7536 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 7537 return SDValue(); 7538 7539 AArch64CC::CondCode OFCC; 7540 SDValue Value, Overflow; 7541 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 7542 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 7543 7544 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 7545 CCVal, Overflow); 7546 } 7547 7548 // Lower it the same way as we would lower a SELECT_CC node. 
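  // If the condition is itself a SETCC we can reuse its operands and predicate
  // directly; any other boolean condition value is compared against zero with
  // SETNE.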
7549 ISD::CondCode CC; 7550 SDValue LHS, RHS; 7551 if (CCVal.getOpcode() == ISD::SETCC) { 7552 LHS = CCVal.getOperand(0); 7553 RHS = CCVal.getOperand(1); 7554 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get(); 7555 } else { 7556 LHS = CCVal; 7557 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 7558 CC = ISD::SETNE; 7559 } 7560 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 7561 } 7562 7563 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 7564 SelectionDAG &DAG) const { 7565 // Jump table entries as PC relative offsets. No additional tweaking 7566 // is necessary here. Just get the address of the jump table. 7567 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7568 7569 if (getTargetMachine().getCodeModel() == CodeModel::Large && 7570 !Subtarget->isTargetMachO()) { 7571 return getAddrLarge(JT, DAG); 7572 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 7573 return getAddrTiny(JT, DAG); 7574 } 7575 return getAddr(JT, DAG); 7576 } 7577 7578 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, 7579 SelectionDAG &DAG) const { 7580 // Jump table entries as PC relative offsets. No additional tweaking 7581 // is necessary here. Just get the address of the jump table. 7582 SDLoc DL(Op); 7583 SDValue JT = Op.getOperand(1); 7584 SDValue Entry = Op.getOperand(2); 7585 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); 7586 7587 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 7588 AFI->setJumpTableEntryInfo(JTI, 4, nullptr); 7589 7590 SDNode *Dest = 7591 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, 7592 Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); 7593 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), 7594 SDValue(Dest, 0)); 7595 } 7596 7597 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 7598 SelectionDAG &DAG) const { 7599 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7600 7601 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 7602 // Use the GOT for the large code model on iOS. 
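    // Elsewhere the large code model materializes the address with a
    // MOVZ/MOVK sequence (getAddrLarge), the tiny code model uses a
    // PC-relative ADR (getAddrTiny), and the default path uses ADRP plus a
    // low-12 offset (getAddr).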
7603 if (Subtarget->isTargetMachO()) { 7604 return getGOT(CP, DAG); 7605 } 7606 return getAddrLarge(CP, DAG); 7607 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 7608 return getAddrTiny(CP, DAG); 7609 } else { 7610 return getAddr(CP, DAG); 7611 } 7612 } 7613 7614 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 7615 SelectionDAG &DAG) const { 7616 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); 7617 if (getTargetMachine().getCodeModel() == CodeModel::Large && 7618 !Subtarget->isTargetMachO()) { 7619 return getAddrLarge(BA, DAG); 7620 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 7621 return getAddrTiny(BA, DAG); 7622 } 7623 return getAddr(BA, DAG); 7624 } 7625 7626 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 7627 SelectionDAG &DAG) const { 7628 AArch64FunctionInfo *FuncInfo = 7629 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 7630 7631 SDLoc DL(Op); 7632 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 7633 getPointerTy(DAG.getDataLayout())); 7634 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); 7635 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7636 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7637 MachinePointerInfo(SV)); 7638 } 7639 7640 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, 7641 SelectionDAG &DAG) const { 7642 AArch64FunctionInfo *FuncInfo = 7643 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 7644 7645 SDLoc DL(Op); 7646 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 7647 ? FuncInfo->getVarArgsGPRIndex() 7648 : FuncInfo->getVarArgsStackIndex(), 7649 getPointerTy(DAG.getDataLayout())); 7650 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7651 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7652 MachinePointerInfo(SV)); 7653 } 7654 7655 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 7656 SelectionDAG &DAG) const { 7657 // The layout of the va_list struct is specified in the AArch64 Procedure Call 7658 // Standard, section B.3. 7659 MachineFunction &MF = DAG.getMachineFunction(); 7660 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 7661 unsigned PtrSize = Subtarget->isTargetILP32() ? 
4 : 8; 7662 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 7663 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7664 SDLoc DL(Op); 7665 7666 SDValue Chain = Op.getOperand(0); 7667 SDValue VAList = Op.getOperand(1); 7668 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7669 SmallVector<SDValue, 4> MemOps; 7670 7671 // void *__stack at offset 0 7672 unsigned Offset = 0; 7673 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 7674 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); 7675 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 7676 MachinePointerInfo(SV), Align(PtrSize))); 7677 7678 // void *__gr_top at offset 8 (4 on ILP32) 7679 Offset += PtrSize; 7680 int GPRSize = FuncInfo->getVarArgsGPRSize(); 7681 if (GPRSize > 0) { 7682 SDValue GRTop, GRTopAddr; 7683 7684 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7685 DAG.getConstant(Offset, DL, PtrVT)); 7686 7687 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 7688 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 7689 DAG.getConstant(GPRSize, DL, PtrVT)); 7690 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); 7691 7692 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 7693 MachinePointerInfo(SV, Offset), 7694 Align(PtrSize))); 7695 } 7696 7697 // void *__vr_top at offset 16 (8 on ILP32) 7698 Offset += PtrSize; 7699 int FPRSize = FuncInfo->getVarArgsFPRSize(); 7700 if (FPRSize > 0) { 7701 SDValue VRTop, VRTopAddr; 7702 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7703 DAG.getConstant(Offset, DL, PtrVT)); 7704 7705 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 7706 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 7707 DAG.getConstant(FPRSize, DL, PtrVT)); 7708 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); 7709 7710 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 7711 MachinePointerInfo(SV, Offset), 7712 Align(PtrSize))); 7713 } 7714 7715 // int __gr_offs at offset 24 (12 on ILP32) 7716 Offset += PtrSize; 7717 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7718 DAG.getConstant(Offset, DL, PtrVT)); 7719 MemOps.push_back( 7720 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), 7721 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 7722 7723 // int __vr_offs at offset 28 (16 on ILP32) 7724 Offset += 4; 7725 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7726 DAG.getConstant(Offset, DL, PtrVT)); 7727 MemOps.push_back( 7728 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), 7729 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 7730 7731 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 7732 } 7733 7734 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 7735 SelectionDAG &DAG) const { 7736 MachineFunction &MF = DAG.getMachineFunction(); 7737 7738 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) 7739 return LowerWin64_VASTART(Op, DAG); 7740 else if (Subtarget->isTargetDarwin()) 7741 return LowerDarwin_VASTART(Op, DAG); 7742 else 7743 return LowerAAPCS_VASTART(Op, DAG); 7744 } 7745 7746 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 7747 SelectionDAG &DAG) const { 7748 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 7749 // pointer. 7750 SDLoc DL(Op); 7751 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; 7752 unsigned VaListSize = 7753 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) 7754 ? PtrSize 7755 : Subtarget->isTargetILP32() ? 
20 : 32; 7756 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7757 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7758 7759 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), 7760 DAG.getConstant(VaListSize, DL, MVT::i32), 7761 Align(PtrSize), false, false, false, 7762 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 7763 } 7764 7765 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7766 assert(Subtarget->isTargetDarwin() && 7767 "automatic va_arg instruction only works on Darwin"); 7768 7769 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7770 EVT VT = Op.getValueType(); 7771 SDLoc DL(Op); 7772 SDValue Chain = Op.getOperand(0); 7773 SDValue Addr = Op.getOperand(1); 7774 MaybeAlign Align(Op.getConstantOperandVal(3)); 7775 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; 7776 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7777 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 7778 SDValue VAList = 7779 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); 7780 Chain = VAList.getValue(1); 7781 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); 7782 7783 if (VT.isScalableVector()) 7784 report_fatal_error("Passing SVE types to variadic functions is " 7785 "currently not supported"); 7786 7787 if (Align && *Align > MinSlotSize) { 7788 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7789 DAG.getConstant(Align->value() - 1, DL, PtrVT)); 7790 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 7791 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); 7792 } 7793 7794 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 7795 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 7796 7797 // Scalar integer and FP values smaller than 64 bits are implicitly extended 7798 // up to 64 bits. At the very least, we have to increase the striding of the 7799 // vaargs list to match this, and for FP values we need to introduce 7800 // FP_ROUND nodes as well. 7801 if (VT.isInteger() && !VT.isVector()) 7802 ArgSize = std::max(ArgSize, MinSlotSize); 7803 bool NeedFPTrunc = false; 7804 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 7805 ArgSize = 8; 7806 NeedFPTrunc = true; 7807 } 7808 7809 // Increment the pointer, VAList, to the next vaarg 7810 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 7811 DAG.getConstant(ArgSize, DL, PtrVT)); 7812 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); 7813 7814 // Store the incremented VAList to the legalized pointer 7815 SDValue APStore = 7816 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); 7817 7818 // Load the actual argument out of the pointer VAList 7819 if (NeedFPTrunc) { 7820 // Load the value as an f64. 7821 SDValue WideFP = 7822 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); 7823 // Round the value down to an f32. 7824 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 7825 DAG.getIntPtrConstant(1, DL)); 7826 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 7827 // Merge the rounded value with the chain output of the load. 
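    // (Rounding back to f32 is lossless here: the value was widened to f64 by
    // the C variadic argument promotion rules in the first place.)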
7828 return DAG.getMergeValues(Ops, DL); 7829 } 7830 7831 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); 7832 } 7833 7834 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 7835 SelectionDAG &DAG) const { 7836 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7837 MFI.setFrameAddressIsTaken(true); 7838 7839 EVT VT = Op.getValueType(); 7840 SDLoc DL(Op); 7841 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7842 SDValue FrameAddr = 7843 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); 7844 while (Depth--) 7845 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 7846 MachinePointerInfo()); 7847 7848 if (Subtarget->isTargetILP32()) 7849 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, 7850 DAG.getValueType(VT)); 7851 7852 return FrameAddr; 7853 } 7854 7855 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, 7856 SelectionDAG &DAG) const { 7857 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7858 7859 EVT VT = getPointerTy(DAG.getDataLayout()); 7860 SDLoc DL(Op); 7861 int FI = MFI.CreateFixedObject(4, 0, false); 7862 return DAG.getFrameIndex(FI, VT); 7863 } 7864 7865 #define GET_REGISTER_MATCHER 7866 #include "AArch64GenAsmMatcher.inc" 7867 7868 // FIXME? Maybe this could be a TableGen attribute on some registers and 7869 // this table could be generated automatically from RegInfo. 7870 Register AArch64TargetLowering:: 7871 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { 7872 Register Reg = MatchRegisterName(RegName); 7873 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { 7874 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); 7875 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); 7876 if (!Subtarget->isXRegisterReserved(DwarfRegNum)) 7877 Reg = 0; 7878 } 7879 if (Reg) 7880 return Reg; 7881 report_fatal_error(Twine("Invalid register name \"" 7882 + StringRef(RegName) + "\".")); 7883 } 7884 7885 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, 7886 SelectionDAG &DAG) const { 7887 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); 7888 7889 EVT VT = Op.getValueType(); 7890 SDLoc DL(Op); 7891 7892 SDValue FrameAddr = 7893 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 7894 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 7895 7896 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); 7897 } 7898 7899 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 7900 SelectionDAG &DAG) const { 7901 MachineFunction &MF = DAG.getMachineFunction(); 7902 MachineFrameInfo &MFI = MF.getFrameInfo(); 7903 MFI.setReturnAddressIsTaken(true); 7904 7905 EVT VT = Op.getValueType(); 7906 SDLoc DL(Op); 7907 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7908 SDValue ReturnAddress; 7909 if (Depth) { 7910 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7911 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 7912 ReturnAddress = DAG.getLoad( 7913 VT, DL, DAG.getEntryNode(), 7914 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); 7915 } else { 7916 // Return LR, which contains the return address. Mark it an implicit 7917 // live-in. 
    unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  }

  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A, therefore it can be safely used for any pre-Armv8.3-A
  // architecture. On Armv8.3-A and onwards XPACI is available, so use
  // that instead.
  SDNode *St;
  if (Subtarget->hasPAuth()) {
    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  } else {
    // XPACLRI operates on LR therefore we must move the operand accordingly.
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  }
  return SDValue(St, 0);
}

/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Lo, Hi;
  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}

bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses.
  return false;
}

bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                         bool OptForSize) const {
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
  // and for the 16-bit case when the target has full fp16 support.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.

  // If we cannot materialize the value in an fmov immediate field, check if it
  // can be encoded as the immediate operand of a logical instruction.
  // The immediate value will be created with either MOVZ, MOVN, or ORR.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit ourselves to 2 instructions at most.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
                              Insn);
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ?
"Legal " : "Illegal ") << VT.getEVTString() 7987 << " imm value: "; Imm.dump();); 7988 return IsLegal; 7989 } 7990 7991 //===----------------------------------------------------------------------===// 7992 // AArch64 Optimization Hooks 7993 //===----------------------------------------------------------------------===// 7994 7995 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, 7996 SDValue Operand, SelectionDAG &DAG, 7997 int &ExtraSteps) { 7998 EVT VT = Operand.getValueType(); 7999 if (ST->hasNEON() && 8000 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || 8001 VT == MVT::f32 || VT == MVT::v1f32 || 8002 VT == MVT::v2f32 || VT == MVT::v4f32)) { 8003 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) 8004 // For the reciprocal estimates, convergence is quadratic, so the number 8005 // of digits is doubled after each iteration. In ARMv8, the accuracy of 8006 // the initial estimate is 2^-8. Thus the number of extra steps to refine 8007 // the result for float (23 mantissa bits) is 2 and for double (52 8008 // mantissa bits) is 3. 8009 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; 8010 8011 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 8012 } 8013 8014 return SDValue(); 8015 } 8016 8017 SDValue 8018 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, 8019 const DenormalMode &Mode) const { 8020 SDLoc DL(Op); 8021 EVT VT = Op.getValueType(); 8022 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 8023 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 8024 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); 8025 } 8026 8027 SDValue 8028 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, 8029 SelectionDAG &DAG) const { 8030 return Op; 8031 } 8032 8033 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, 8034 SelectionDAG &DAG, int Enabled, 8035 int &ExtraSteps, 8036 bool &UseOneConst, 8037 bool Reciprocal) const { 8038 if (Enabled == ReciprocalEstimate::Enabled || 8039 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) 8040 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, 8041 DAG, ExtraSteps)) { 8042 SDLoc DL(Operand); 8043 EVT VT = Operand.getValueType(); 8044 8045 SDNodeFlags Flags; 8046 Flags.setAllowReassociation(true); 8047 8048 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) 8049 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) 8050 for (int i = ExtraSteps; i > 0; --i) { 8051 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, 8052 Flags); 8053 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); 8054 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 8055 } 8056 if (!Reciprocal) 8057 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); 8058 8059 ExtraSteps = 0; 8060 return Estimate; 8061 } 8062 8063 return SDValue(); 8064 } 8065 8066 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 8067 SelectionDAG &DAG, int Enabled, 8068 int &ExtraSteps) const { 8069 if (Enabled == ReciprocalEstimate::Enabled) 8070 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, 8071 DAG, ExtraSteps)) { 8072 SDLoc DL(Operand); 8073 EVT VT = Operand.getValueType(); 8074 8075 SDNodeFlags Flags; 8076 Flags.setAllowReassociation(true); 8077 8078 // Newton reciprocal iteration: E * (2 - X * E) 8079 // AArch64 reciprocal iteration instruction: (2 - M * N) 8080 for (int i = ExtraSteps; i > 0; 
--i) { 8081 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, 8082 Estimate, Flags); 8083 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 8084 } 8085 8086 ExtraSteps = 0; 8087 return Estimate; 8088 } 8089 8090 return SDValue(); 8091 } 8092 8093 //===----------------------------------------------------------------------===// 8094 // AArch64 Inline Assembly Support 8095 //===----------------------------------------------------------------------===// 8096 8097 // Table of Constraints 8098 // TODO: This is the current set of constraints supported by ARM for the 8099 // compiler, not all of them may make sense. 8100 // 8101 // r - A general register 8102 // w - An FP/SIMD register of some size in the range v0-v31 8103 // x - An FP/SIMD register of some size in the range v0-v15 8104 // I - Constant that can be used with an ADD instruction 8105 // J - Constant that can be used with a SUB instruction 8106 // K - Constant that can be used with a 32-bit logical instruction 8107 // L - Constant that can be used with a 64-bit logical instruction 8108 // M - Constant that can be used as a 32-bit MOV immediate 8109 // N - Constant that can be used as a 64-bit MOV immediate 8110 // Q - A memory reference with base register and no offset 8111 // S - A symbolic address 8112 // Y - Floating point constant zero 8113 // Z - Integer constant zero 8114 // 8115 // Note that general register operands will be output using their 64-bit x 8116 // register name, whatever the size of the variable, unless the asm operand 8117 // is prefixed by the %w modifier. Floating-point and SIMD register operands 8118 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 8119 // %q modifier. 8120 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 8121 // At this point, we have to lower this constraint to something else, so we 8122 // lower it to an "r" or "w". However, by doing this we will force the result 8123 // to be in register, while the X constraint is much more permissive. 8124 // 8125 // Although we are correct (we are free to emit anything, without 8126 // constraints), we might break use cases that would expect us to be more 8127 // efficient and emit something else. 8128 if (!Subtarget->hasFPARMv8()) 8129 return "r"; 8130 8131 if (ConstraintVT.isFloatingPoint()) 8132 return "w"; 8133 8134 if (ConstraintVT.isVector() && 8135 (ConstraintVT.getSizeInBits() == 64 || 8136 ConstraintVT.getSizeInBits() == 128)) 8137 return "w"; 8138 8139 return "r"; 8140 } 8141 8142 enum PredicateConstraint { 8143 Upl, 8144 Upa, 8145 Invalid 8146 }; 8147 8148 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { 8149 PredicateConstraint P = PredicateConstraint::Invalid; 8150 if (Constraint == "Upa") 8151 P = PredicateConstraint::Upa; 8152 if (Constraint == "Upl") 8153 P = PredicateConstraint::Upl; 8154 return P; 8155 } 8156 8157 /// getConstraintType - Given a constraint letter, return the type of 8158 /// constraint it is for this target. 8159 AArch64TargetLowering::ConstraintType 8160 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 8161 if (Constraint.size() == 1) { 8162 switch (Constraint[0]) { 8163 default: 8164 break; 8165 case 'x': 8166 case 'w': 8167 case 'y': 8168 return C_RegisterClass; 8169 // An address with a single base register. Due to the way we 8170 // currently handle addresses it is the same as 'r'. 
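    // A typical use is inline asm around exclusive loads/stores, e.g. something
    // like: asm("ldxr %0, %1" : "=r"(V) : "Q"(*Ptr));, where the operand must
    // be a bare [base] address with no offset.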
8171 case 'Q': 8172 return C_Memory; 8173 case 'I': 8174 case 'J': 8175 case 'K': 8176 case 'L': 8177 case 'M': 8178 case 'N': 8179 case 'Y': 8180 case 'Z': 8181 return C_Immediate; 8182 case 'z': 8183 case 'S': // A symbolic address 8184 return C_Other; 8185 } 8186 } else if (parsePredicateConstraint(Constraint) != 8187 PredicateConstraint::Invalid) 8188 return C_RegisterClass; 8189 return TargetLowering::getConstraintType(Constraint); 8190 } 8191 8192 /// Examine constraint type and operand type and determine a weight value. 8193 /// This object must already have been set up with the operand type 8194 /// and the current alternative constraint selected. 8195 TargetLowering::ConstraintWeight 8196 AArch64TargetLowering::getSingleConstraintMatchWeight( 8197 AsmOperandInfo &info, const char *constraint) const { 8198 ConstraintWeight weight = CW_Invalid; 8199 Value *CallOperandVal = info.CallOperandVal; 8200 // If we don't have a value, we can't do a match, 8201 // but allow it at the lowest weight. 8202 if (!CallOperandVal) 8203 return CW_Default; 8204 Type *type = CallOperandVal->getType(); 8205 // Look at the constraint type. 8206 switch (*constraint) { 8207 default: 8208 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 8209 break; 8210 case 'x': 8211 case 'w': 8212 case 'y': 8213 if (type->isFloatingPointTy() || type->isVectorTy()) 8214 weight = CW_Register; 8215 break; 8216 case 'z': 8217 weight = CW_Constant; 8218 break; 8219 case 'U': 8220 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) 8221 weight = CW_Register; 8222 break; 8223 } 8224 return weight; 8225 } 8226 8227 std::pair<unsigned, const TargetRegisterClass *> 8228 AArch64TargetLowering::getRegForInlineAsmConstraint( 8229 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 8230 if (Constraint.size() == 1) { 8231 switch (Constraint[0]) { 8232 case 'r': 8233 if (VT.isScalableVector()) 8234 return std::make_pair(0U, nullptr); 8235 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) 8236 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); 8237 if (VT.getFixedSizeInBits() == 64) 8238 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 8239 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 8240 case 'w': { 8241 if (!Subtarget->hasFPARMv8()) 8242 break; 8243 if (VT.isScalableVector()) { 8244 if (VT.getVectorElementType() != MVT::i1) 8245 return std::make_pair(0U, &AArch64::ZPRRegClass); 8246 return std::make_pair(0U, nullptr); 8247 } 8248 uint64_t VTSize = VT.getFixedSizeInBits(); 8249 if (VTSize == 16) 8250 return std::make_pair(0U, &AArch64::FPR16RegClass); 8251 if (VTSize == 32) 8252 return std::make_pair(0U, &AArch64::FPR32RegClass); 8253 if (VTSize == 64) 8254 return std::make_pair(0U, &AArch64::FPR64RegClass); 8255 if (VTSize == 128) 8256 return std::make_pair(0U, &AArch64::FPR128RegClass); 8257 break; 8258 } 8259 // The instructions that this constraint is designed for can 8260 // only take 128-bit registers so just use that regclass. 
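    // ('x' restricts the operand to v0-v15, as required by, for example, the
    // 16-bit-element indexed forms of FMLA/FMUL whose Vm field is only four
    // bits wide.)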
8261 case 'x': 8262 if (!Subtarget->hasFPARMv8()) 8263 break; 8264 if (VT.isScalableVector()) 8265 return std::make_pair(0U, &AArch64::ZPR_4bRegClass); 8266 if (VT.getSizeInBits() == 128) 8267 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 8268 break; 8269 case 'y': 8270 if (!Subtarget->hasFPARMv8()) 8271 break; 8272 if (VT.isScalableVector()) 8273 return std::make_pair(0U, &AArch64::ZPR_3bRegClass); 8274 break; 8275 } 8276 } else { 8277 PredicateConstraint PC = parsePredicateConstraint(Constraint); 8278 if (PC != PredicateConstraint::Invalid) { 8279 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) 8280 return std::make_pair(0U, nullptr); 8281 bool restricted = (PC == PredicateConstraint::Upl); 8282 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) 8283 : std::make_pair(0U, &AArch64::PPRRegClass); 8284 } 8285 } 8286 if (StringRef("{cc}").equals_insensitive(Constraint)) 8287 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 8288 8289 // Use the default implementation in TargetLowering to convert the register 8290 // constraint into a member of a register class. 8291 std::pair<unsigned, const TargetRegisterClass *> Res; 8292 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 8293 8294 // Not found as a standard register? 8295 if (!Res.second) { 8296 unsigned Size = Constraint.size(); 8297 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 8298 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 8299 int RegNo; 8300 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 8301 if (!Failed && RegNo >= 0 && RegNo <= 31) { 8302 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 8303 // By default we'll emit v0-v31 for this unless there's a modifier where 8304 // we'll emit the correct register as well. 8305 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 8306 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 8307 Res.second = &AArch64::FPR64RegClass; 8308 } else { 8309 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 8310 Res.second = &AArch64::FPR128RegClass; 8311 } 8312 } 8313 } 8314 } 8315 8316 if (Res.second && !Subtarget->hasFPARMv8() && 8317 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && 8318 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) 8319 return std::make_pair(0U, nullptr); 8320 8321 return Res; 8322 } 8323 8324 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, 8325 llvm::Type *Ty, 8326 bool AllowUnknown) const { 8327 if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) 8328 return EVT(MVT::i64x8); 8329 8330 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); 8331 } 8332 8333 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 8334 /// vector. If it is invalid, don't add anything to Ops. 8335 void AArch64TargetLowering::LowerAsmOperandForConstraint( 8336 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 8337 SelectionDAG &DAG) const { 8338 SDValue Result; 8339 8340 // Currently only support length 1 constraints. 8341 if (Constraint.length() != 1) 8342 return; 8343 8344 char ConstraintLetter = Constraint[0]; 8345 switch (ConstraintLetter) { 8346 default: 8347 break; 8348 8349 // This set of constraints deal with valid constants for various instructions. 8350 // Validate and return a target constant for them if we can. 8351 case 'z': { 8352 // 'z' maps to xzr or wzr so it needs an input of 0. 
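    // Folding the zero straight into XZR/WZR saves materializing an explicit 0
    // in a general-purpose register.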
8353 if (!isNullConstant(Op)) 8354 return; 8355 8356 if (Op.getValueType() == MVT::i64) 8357 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 8358 else 8359 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 8360 break; 8361 } 8362 case 'S': { 8363 // An absolute symbolic address or label reference. 8364 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 8365 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 8366 GA->getValueType(0)); 8367 } else if (const BlockAddressSDNode *BA = 8368 dyn_cast<BlockAddressSDNode>(Op)) { 8369 Result = 8370 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); 8371 } else 8372 return; 8373 break; 8374 } 8375 8376 case 'I': 8377 case 'J': 8378 case 'K': 8379 case 'L': 8380 case 'M': 8381 case 'N': 8382 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 8383 if (!C) 8384 return; 8385 8386 // Grab the value and do some validation. 8387 uint64_t CVal = C->getZExtValue(); 8388 switch (ConstraintLetter) { 8389 // The I constraint applies only to simple ADD or SUB immediate operands: 8390 // i.e. 0 to 4095 with optional shift by 12 8391 // The J constraint applies only to ADD or SUB immediates that would be 8392 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 8393 // instruction [or vice versa], in other words -1 to -4095 with optional 8394 // left shift by 12. 8395 case 'I': 8396 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 8397 break; 8398 return; 8399 case 'J': { 8400 uint64_t NVal = -C->getSExtValue(); 8401 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 8402 CVal = C->getSExtValue(); 8403 break; 8404 } 8405 return; 8406 } 8407 // The K and L constraints apply *only* to logical immediates, including 8408 // what used to be the MOVI alias for ORR (though the MOVI alias has now 8409 // been removed and MOV should be used). So these constraints have to 8410 // distinguish between bit patterns that are valid 32-bit or 64-bit 8411 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 8412 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 8413 // versa. 8414 case 'K': 8415 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 8416 break; 8417 return; 8418 case 'L': 8419 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 8420 break; 8421 return; 8422 // The M and N constraints are a superset of K and L respectively, for use 8423 // with the MOV (immediate) alias. As well as the logical immediates they 8424 // also match 32 or 64-bit immediates that can be loaded either using a 8425 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 8426 // (M) or 64-bit 0x1234000000000000 (N) etc. 8427 // As a note some of this code is liberally stolen from the asm parser. 
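    // (A MOVN-able value is one whose bitwise inverse fits in a single shifted
    // 16-bit chunk, which is what the NCVal checks below test for.)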
8428 case 'M': { 8429 if (!isUInt<32>(CVal)) 8430 return; 8431 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 8432 break; 8433 if ((CVal & 0xFFFF) == CVal) 8434 break; 8435 if ((CVal & 0xFFFF0000ULL) == CVal) 8436 break; 8437 uint64_t NCVal = ~(uint32_t)CVal; 8438 if ((NCVal & 0xFFFFULL) == NCVal) 8439 break; 8440 if ((NCVal & 0xFFFF0000ULL) == NCVal) 8441 break; 8442 return; 8443 } 8444 case 'N': { 8445 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 8446 break; 8447 if ((CVal & 0xFFFFULL) == CVal) 8448 break; 8449 if ((CVal & 0xFFFF0000ULL) == CVal) 8450 break; 8451 if ((CVal & 0xFFFF00000000ULL) == CVal) 8452 break; 8453 if ((CVal & 0xFFFF000000000000ULL) == CVal) 8454 break; 8455 uint64_t NCVal = ~CVal; 8456 if ((NCVal & 0xFFFFULL) == NCVal) 8457 break; 8458 if ((NCVal & 0xFFFF0000ULL) == NCVal) 8459 break; 8460 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 8461 break; 8462 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 8463 break; 8464 return; 8465 } 8466 default: 8467 return; 8468 } 8469 8470 // All assembler immediates are 64-bit integers. 8471 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 8472 break; 8473 } 8474 8475 if (Result.getNode()) { 8476 Ops.push_back(Result); 8477 return; 8478 } 8479 8480 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 8481 } 8482 8483 //===----------------------------------------------------------------------===// 8484 // AArch64 Advanced SIMD Support 8485 //===----------------------------------------------------------------------===// 8486 8487 /// WidenVector - Given a value in the V64 register class, produce the 8488 /// equivalent value in the V128 register class. 8489 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 8490 EVT VT = V64Reg.getValueType(); 8491 unsigned NarrowSize = VT.getVectorNumElements(); 8492 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 8493 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 8494 SDLoc DL(V64Reg); 8495 8496 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 8497 V64Reg, DAG.getConstant(0, DL, MVT::i64)); 8498 } 8499 8500 /// getExtFactor - Determine the adjustment factor for the position when 8501 /// generating an "extract from vector registers" instruction. 8502 static unsigned getExtFactor(SDValue &V) { 8503 EVT EltType = V.getValueType().getVectorElementType(); 8504 return EltType.getSizeInBits() / 8; 8505 } 8506 8507 /// NarrowVector - Given a value in the V128 register class, produce the 8508 /// equivalent value in the V64 register class. 8509 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 8510 EVT VT = V128Reg.getValueType(); 8511 unsigned WideSize = VT.getVectorNumElements(); 8512 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 8513 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 8514 SDLoc DL(V128Reg); 8515 8516 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 8517 } 8518 8519 // Gather data to see if the operation can be modelled as a 8520 // shuffle in combination with VEXTs. 
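// For example, a BUILD_VECTOR whose operands are EXTRACT_VECTOR_ELTs taken
// from at most two source vectors can usually be rewritten as a single
// VECTOR_SHUFFLE of those sources, after the sources have been adjusted with
// EXT/subvector operations to a compatible width.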
8521 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 8522 SelectionDAG &DAG) const { 8523 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 8524 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); 8525 SDLoc dl(Op); 8526 EVT VT = Op.getValueType(); 8527 assert(!VT.isScalableVector() && 8528 "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); 8529 unsigned NumElts = VT.getVectorNumElements(); 8530 8531 struct ShuffleSourceInfo { 8532 SDValue Vec; 8533 unsigned MinElt; 8534 unsigned MaxElt; 8535 8536 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 8537 // be compatible with the shuffle we intend to construct. As a result 8538 // ShuffleVec will be some sliding window into the original Vec. 8539 SDValue ShuffleVec; 8540 8541 // Code should guarantee that element i in Vec starts at element "WindowBase 8542 // + i * WindowScale in ShuffleVec". 8543 int WindowBase; 8544 int WindowScale; 8545 8546 ShuffleSourceInfo(SDValue Vec) 8547 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), 8548 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} 8549 8550 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 8551 }; 8552 8553 // First gather all vectors used as an immediate source for this BUILD_VECTOR 8554 // node. 8555 SmallVector<ShuffleSourceInfo, 2> Sources; 8556 for (unsigned i = 0; i < NumElts; ++i) { 8557 SDValue V = Op.getOperand(i); 8558 if (V.isUndef()) 8559 continue; 8560 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 8561 !isa<ConstantSDNode>(V.getOperand(1))) { 8562 LLVM_DEBUG( 8563 dbgs() << "Reshuffle failed: " 8564 "a shuffle can only come from building a vector from " 8565 "various elements of other vectors, provided their " 8566 "indices are constant\n"); 8567 return SDValue(); 8568 } 8569 8570 // Add this element source to the list if it's not already there. 8571 SDValue SourceVec = V.getOperand(0); 8572 auto Source = find(Sources, SourceVec); 8573 if (Source == Sources.end()) 8574 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 8575 8576 // Update the minimum and maximum lane number seen. 8577 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 8578 Source->MinElt = std::min(Source->MinElt, EltNo); 8579 Source->MaxElt = std::max(Source->MaxElt, EltNo); 8580 } 8581 8582 if (Sources.size() > 2) { 8583 LLVM_DEBUG( 8584 dbgs() << "Reshuffle failed: currently only do something sane when at " 8585 "most two source vectors are involved\n"); 8586 return SDValue(); 8587 } 8588 8589 // Find out the smallest element size among result and two sources, and use 8590 // it as element size to build the shuffle_vector. 8591 EVT SmallestEltTy = VT.getVectorElementType(); 8592 for (auto &Source : Sources) { 8593 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 8594 if (SrcEltTy.bitsLT(SmallestEltTy)) { 8595 SmallestEltTy = SrcEltTy; 8596 } 8597 } 8598 unsigned ResMultiplier = 8599 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); 8600 uint64_t VTSize = VT.getFixedSizeInBits(); 8601 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); 8602 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 8603 8604 // If the source vector is too wide or too narrow, we may nevertheless be able 8605 // to construct a compatible shuffle either by concatenating it with UNDEF or 8606 // extracting a suitable range of elements. 
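  // A source exactly twice as wide as the result is narrowed by extracting one
  // half, or via AArch64ISD::EXT when the referenced elements straddle both
  // halves; anything wider than that is rejected.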
8607 for (auto &Src : Sources) { 8608 EVT SrcVT = Src.ShuffleVec.getValueType(); 8609 8610 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); 8611 if (SrcVTSize == VTSize) 8612 continue; 8613 8614 // This stage of the search produces a source with the same element type as 8615 // the original, but with a total width matching the BUILD_VECTOR output. 8616 EVT EltVT = SrcVT.getVectorElementType(); 8617 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 8618 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 8619 8620 if (SrcVTSize < VTSize) { 8621 assert(2 * SrcVTSize == VTSize); 8622 // We can pad out the smaller vector for free, so if it's part of a 8623 // shuffle... 8624 Src.ShuffleVec = 8625 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 8626 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 8627 continue; 8628 } 8629 8630 if (SrcVTSize != 2 * VTSize) { 8631 LLVM_DEBUG( 8632 dbgs() << "Reshuffle failed: result vector too small to extract\n"); 8633 return SDValue(); 8634 } 8635 8636 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 8637 LLVM_DEBUG( 8638 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); 8639 return SDValue(); 8640 } 8641 8642 if (Src.MinElt >= NumSrcElts) { 8643 // The extraction can just take the second half 8644 Src.ShuffleVec = 8645 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8646 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 8647 Src.WindowBase = -NumSrcElts; 8648 } else if (Src.MaxElt < NumSrcElts) { 8649 // The extraction can just take the first half 8650 Src.ShuffleVec = 8651 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8652 DAG.getConstant(0, dl, MVT::i64)); 8653 } else { 8654 // An actual VEXT is needed 8655 SDValue VEXTSrc1 = 8656 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8657 DAG.getConstant(0, dl, MVT::i64)); 8658 SDValue VEXTSrc2 = 8659 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8660 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 8661 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 8662 8663 if (!SrcVT.is64BitVector()) { 8664 LLVM_DEBUG( 8665 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " 8666 "for SVE vectors."); 8667 return SDValue(); 8668 } 8669 8670 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 8671 VEXTSrc2, 8672 DAG.getConstant(Imm, dl, MVT::i32)); 8673 Src.WindowBase = -Src.MinElt; 8674 } 8675 } 8676 8677 // Another possible incompatibility occurs from the vector element types. We 8678 // can fix this by bitcasting the source vectors to the same type we intend 8679 // for the shuffle. 8680 for (auto &Src : Sources) { 8681 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 8682 if (SrcEltTy == SmallestEltTy) 8683 continue; 8684 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 8685 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 8686 Src.WindowScale = 8687 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); 8688 Src.WindowBase *= Src.WindowScale; 8689 } 8690 8691 // Final sanity check before we try to actually produce a shuffle. 8692 LLVM_DEBUG(for (auto Src 8693 : Sources) 8694 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 8695 8696 // The stars all align, our next step is to produce the mask for the shuffle. 
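  // Each defined BUILD_VECTOR operand fills up to ResMultiplier consecutive
  // lanes of the mask, expressed in units of the smallest element type chosen
  // above; lanes with no defined bits stay -1 (undef).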
8697 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 8698 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 8699 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 8700 SDValue Entry = Op.getOperand(i); 8701 if (Entry.isUndef()) 8702 continue; 8703 8704 auto Src = find(Sources, Entry.getOperand(0)); 8705 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 8706 8707 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 8708 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 8709 // segment. 8710 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 8711 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 8712 VT.getScalarSizeInBits()); 8713 int LanesDefined = BitsDefined / BitsPerShuffleLane; 8714 8715 // This source is expected to fill ResMultiplier lanes of the final shuffle, 8716 // starting at the appropriate offset. 8717 int *LaneMask = &Mask[i * ResMultiplier]; 8718 8719 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 8720 ExtractBase += NumElts * (Src - Sources.begin()); 8721 for (int j = 0; j < LanesDefined; ++j) 8722 LaneMask[j] = ExtractBase + j; 8723 } 8724 8725 // Final check before we try to produce nonsense... 8726 if (!isShuffleMaskLegal(Mask, ShuffleVT)) { 8727 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); 8728 return SDValue(); 8729 } 8730 8731 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 8732 for (unsigned i = 0; i < Sources.size(); ++i) 8733 ShuffleOps[i] = Sources[i].ShuffleVec; 8734 8735 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 8736 ShuffleOps[1], Mask); 8737 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 8738 8739 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); 8740 dbgs() << "Reshuffle, creating node: "; V.dump();); 8741 8742 return V; 8743 } 8744 8745 // check if an EXT instruction can handle the shuffle mask when the 8746 // vector sources of the shuffle are the same. 8747 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 8748 unsigned NumElts = VT.getVectorNumElements(); 8749 8750 // Assume that the first shuffle index is not UNDEF. Fail if it is. 8751 if (M[0] < 0) 8752 return false; 8753 8754 Imm = M[0]; 8755 8756 // If this is a VEXT shuffle, the immediate value is the index of the first 8757 // element. The other shuffle indices must be the successive elements after 8758 // the first one. 8759 unsigned ExpectedElt = Imm; 8760 for (unsigned i = 1; i < NumElts; ++i) { 8761 // Increment the expected index. If it wraps around, just follow it 8762 // back to index zero and keep going. 8763 ++ExpectedElt; 8764 if (ExpectedElt == NumElts) 8765 ExpectedElt = 0; 8766 8767 if (M[i] < 0) 8768 continue; // ignore UNDEF indices 8769 if (ExpectedElt != static_cast<unsigned>(M[i])) 8770 return false; 8771 } 8772 8773 return true; 8774 } 8775 8776 /// Check if a vector shuffle corresponds to a DUP instructions with a larger 8777 /// element width than the vector lane type. 
If that is the case, the function 8778 /// returns true and writes the value of the DUP instruction lane operand into 8779 /// DupLaneOp. 8780 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, 8781 unsigned &DupLaneOp) { 8782 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 8783 "Only possible block sizes for wide DUP are: 16, 32, 64"); 8784 8785 if (BlockSize <= VT.getScalarSizeInBits()) 8786 return false; 8787 if (BlockSize % VT.getScalarSizeInBits() != 0) 8788 return false; 8789 if (VT.getSizeInBits() % BlockSize != 0) 8790 return false; 8791 8792 size_t SingleVecNumElements = VT.getVectorNumElements(); 8793 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); 8794 size_t NumBlocks = VT.getSizeInBits() / BlockSize; 8795 8796 // We are looking for masks like 8797 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element 8798 // might be replaced by 'undefined'. BlockIndices will eventually contain 8799 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] 8800 // for the above examples) 8801 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); 8802 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) 8803 for (size_t I = 0; I < NumEltsPerBlock; I++) { 8804 int Elt = M[BlockIndex * NumEltsPerBlock + I]; 8805 if (Elt < 0) 8806 continue; 8807 // For now we don't support shuffles that use the second operand 8808 if ((unsigned)Elt >= SingleVecNumElements) 8809 return false; 8810 if (BlockElts[I] < 0) 8811 BlockElts[I] = Elt; 8812 else if (BlockElts[I] != Elt) 8813 return false; 8814 } 8815 8816 // We found a candidate block (possibly with some undefs). It must be a 8817 // sequence of consecutive integers starting with a value divisible by 8818 // NumEltsPerBlock with some values possibly replaced by undefs. 8819 8820 // Find first non-undef element 8821 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); 8822 assert(FirstRealEltIter != BlockElts.end() && 8823 "Shuffle with all-undefs must have been caught by previous cases, " 8824 "e.g. isSplat()"); 8825 if (FirstRealEltIter == BlockElts.end()) { 8826 DupLaneOp = 0; 8827 return true; 8828 } 8829 8830 // Index of FirstRealElt in BlockElts 8831 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); 8832 8833 if ((unsigned)*FirstRealEltIter < FirstRealIndex) 8834 return false; 8835 // BlockElts[0] must have the following value if it isn't undef: 8836 size_t Elt0 = *FirstRealEltIter - FirstRealIndex; 8837 8838 // Check the first element 8839 if (Elt0 % NumEltsPerBlock != 0) 8840 return false; 8841 // Check that the sequence indeed consists of consecutive integers (modulo 8842 // undefs) 8843 for (size_t I = 0; I < NumEltsPerBlock; I++) 8844 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) 8845 return false; 8846 8847 DupLaneOp = Elt0 / NumEltsPerBlock; 8848 return true; 8849 } 8850 8851 // check if an EXT instruction can handle the shuffle mask when the 8852 // vector sources of the shuffle are different. 8853 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, 8854 unsigned &Imm) { 8855 // Look for the first non-undef element. 8856 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); 8857 8858 // Benefit from APInt to handle overflow when calculating the expected element.
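  // For instance, with NumElts == 8, MaskBits is log2(16) == 4, so
  // ExpectedElt counts modulo 16; running past index 15 wraps back to 0,
  // which is exactly the two-source wrap-around behaviour described in the
  // examples below.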
8859 unsigned NumElts = VT.getVectorNumElements(); 8860 unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); 8861 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); 8862 // The following shuffle indices must be the successive elements after the 8863 // first real element. 8864 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), 8865 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); 8866 if (FirstWrongElt != M.end()) 8867 return false; 8868 8869 // The index of an EXT is the first element if it is not UNDEF. 8870 // Watch out for the beginning UNDEFs. The EXT index should be the expected 8871 // value of the first element. E.g. 8872 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. 8873 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. 8874 // ExpectedElt is the last mask index plus 1. 8875 Imm = ExpectedElt.getZExtValue(); 8876 8877 // There are two different cases that require reversing the input vectors. 8878 // For example, for vector <4 x i32> we have the following cases: 8879 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) 8880 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) 8881 // For both cases, we finally use mask <5, 6, 7, 0>, which requires 8882 // reversing the two input vectors. 8883 if (Imm < NumElts) 8884 ReverseEXT = true; 8885 else 8886 Imm -= NumElts; 8887 8888 return true; 8889 } 8890 8891 /// isREVMask - Check if a vector shuffle corresponds to a REV 8892 /// instruction with the specified blocksize. (The order of the elements 8893 /// within each block of the vector is reversed.) 8894 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 8895 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 8896 "Only possible block sizes for REV are: 16, 32, 64"); 8897 8898 unsigned EltSz = VT.getScalarSizeInBits(); 8899 if (EltSz == 64) 8900 return false; 8901 8902 unsigned NumElts = VT.getVectorNumElements(); 8903 unsigned BlockElts = M[0] + 1; 8904 // If the first shuffle index is UNDEF, be optimistic. 8905 if (M[0] < 0) 8906 BlockElts = BlockSize / EltSz; 8907 8908 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 8909 return false; 8910 8911 for (unsigned i = 0; i < NumElts; ++i) { 8912 if (M[i] < 0) 8913 continue; // ignore UNDEF indices 8914 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 8915 return false; 8916 } 8917 8918 return true; 8919 } 8920 8921 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 8922 unsigned NumElts = VT.getVectorNumElements(); 8923 if (NumElts % 2 != 0) 8924 return false; 8925 WhichResult = (M[0] == 0 ? 0 : 1); 8926 unsigned Idx = WhichResult * NumElts / 2; 8927 for (unsigned i = 0; i != NumElts; i += 2) { 8928 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 8929 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 8930 return false; 8931 Idx += 1; 8932 } 8933 8934 return true; 8935 } 8936 8937 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 8938 unsigned NumElts = VT.getVectorNumElements(); 8939 WhichResult = (M[0] == 0 ? 0 : 1); 8940 for (unsigned i = 0; i != NumElts; ++i) { 8941 if (M[i] < 0) 8942 continue; // ignore UNDEF indices 8943 if ((unsigned)M[i] != 2 * i + WhichResult) 8944 return false; 8945 } 8946 8947 return true; 8948 } 8949 8950 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 8951 unsigned NumElts = VT.getVectorNumElements(); 8952 if (NumElts % 2 != 0) 8953 return false; 8954 WhichResult = (M[0] == 0 ?
0 : 1); 8955 for (unsigned i = 0; i < NumElts; i += 2) { 8956 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 8957 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 8958 return false; 8959 } 8960 return true; 8961 } 8962 8963 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 8964 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 8965 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 8966 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 8967 unsigned NumElts = VT.getVectorNumElements(); 8968 if (NumElts % 2 != 0) 8969 return false; 8970 WhichResult = (M[0] == 0 ? 0 : 1); 8971 unsigned Idx = WhichResult * NumElts / 2; 8972 for (unsigned i = 0; i != NumElts; i += 2) { 8973 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 8974 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 8975 return false; 8976 Idx += 1; 8977 } 8978 8979 return true; 8980 } 8981 8982 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 8983 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 8984 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 8985 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 8986 unsigned Half = VT.getVectorNumElements() / 2; 8987 WhichResult = (M[0] == 0 ? 0 : 1); 8988 for (unsigned j = 0; j != 2; ++j) { 8989 unsigned Idx = WhichResult; 8990 for (unsigned i = 0; i != Half; ++i) { 8991 int MIdx = M[i + j * Half]; 8992 if (MIdx >= 0 && (unsigned)MIdx != Idx) 8993 return false; 8994 Idx += 2; 8995 } 8996 } 8997 8998 return true; 8999 } 9000 9001 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 9002 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 9003 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 9004 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 9005 unsigned NumElts = VT.getVectorNumElements(); 9006 if (NumElts % 2 != 0) 9007 return false; 9008 WhichResult = (M[0] == 0 ? 
0 : 1); 9009 for (unsigned i = 0; i < NumElts; i += 2) { 9010 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 9011 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 9012 return false; 9013 } 9014 return true; 9015 } 9016 9017 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 9018 bool &DstIsLeft, int &Anomaly) { 9019 if (M.size() != static_cast<size_t>(NumInputElements)) 9020 return false; 9021 9022 int NumLHSMatch = 0, NumRHSMatch = 0; 9023 int LastLHSMismatch = -1, LastRHSMismatch = -1; 9024 9025 for (int i = 0; i < NumInputElements; ++i) { 9026 if (M[i] == -1) { 9027 ++NumLHSMatch; 9028 ++NumRHSMatch; 9029 continue; 9030 } 9031 9032 if (M[i] == i) 9033 ++NumLHSMatch; 9034 else 9035 LastLHSMismatch = i; 9036 9037 if (M[i] == i + NumInputElements) 9038 ++NumRHSMatch; 9039 else 9040 LastRHSMismatch = i; 9041 } 9042 9043 if (NumLHSMatch == NumInputElements - 1) { 9044 DstIsLeft = true; 9045 Anomaly = LastLHSMismatch; 9046 return true; 9047 } else if (NumRHSMatch == NumInputElements - 1) { 9048 DstIsLeft = false; 9049 Anomaly = LastRHSMismatch; 9050 return true; 9051 } 9052 9053 return false; 9054 } 9055 9056 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 9057 if (VT.getSizeInBits() != 128) 9058 return false; 9059 9060 unsigned NumElts = VT.getVectorNumElements(); 9061 9062 for (int I = 0, E = NumElts / 2; I != E; I++) { 9063 if (Mask[I] != I) 9064 return false; 9065 } 9066 9067 int Offset = NumElts / 2; 9068 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 9069 if (Mask[I] != I + SplitLHS * Offset) 9070 return false; 9071 } 9072 9073 return true; 9074 } 9075 9076 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 9077 SDLoc DL(Op); 9078 EVT VT = Op.getValueType(); 9079 SDValue V0 = Op.getOperand(0); 9080 SDValue V1 = Op.getOperand(1); 9081 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 9082 9083 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 9084 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 9085 return SDValue(); 9086 9087 bool SplitV0 = V0.getValueSizeInBits() == 128; 9088 9089 if (!isConcatMask(Mask, VT, SplitV0)) 9090 return SDValue(); 9091 9092 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 9093 if (SplitV0) { 9094 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 9095 DAG.getConstant(0, DL, MVT::i64)); 9096 } 9097 if (V1.getValueSizeInBits() == 128) { 9098 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 9099 DAG.getConstant(0, DL, MVT::i64)); 9100 } 9101 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 9102 } 9103 9104 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 9105 /// the specified operations to build the shuffle. 
9106 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 9107 SDValue RHS, SelectionDAG &DAG, 9108 const SDLoc &dl) { 9109 unsigned OpNum = (PFEntry >> 26) & 0x0F; 9110 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 9111 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 9112 9113 enum { 9114 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 9115 OP_VREV, 9116 OP_VDUP0, 9117 OP_VDUP1, 9118 OP_VDUP2, 9119 OP_VDUP3, 9120 OP_VEXT1, 9121 OP_VEXT2, 9122 OP_VEXT3, 9123 OP_VUZPL, // VUZP, left result 9124 OP_VUZPR, // VUZP, right result 9125 OP_VZIPL, // VZIP, left result 9126 OP_VZIPR, // VZIP, right result 9127 OP_VTRNL, // VTRN, left result 9128 OP_VTRNR // VTRN, right result 9129 }; 9130 9131 if (OpNum == OP_COPY) { 9132 if (LHSID == (1 * 9 + 2) * 9 + 3) 9133 return LHS; 9134 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 9135 return RHS; 9136 } 9137 9138 SDValue OpLHS, OpRHS; 9139 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 9140 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 9141 EVT VT = OpLHS.getValueType(); 9142 9143 switch (OpNum) { 9144 default: 9145 llvm_unreachable("Unknown shuffle opcode!"); 9146 case OP_VREV: 9147 // VREV divides the vector in half and swaps within the half. 9148 if (VT.getVectorElementType() == MVT::i32 || 9149 VT.getVectorElementType() == MVT::f32) 9150 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 9151 // vrev <4 x i16> -> REV32 9152 if (VT.getVectorElementType() == MVT::i16 || 9153 VT.getVectorElementType() == MVT::f16 || 9154 VT.getVectorElementType() == MVT::bf16) 9155 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 9156 // vrev <4 x i8> -> REV16 9157 assert(VT.getVectorElementType() == MVT::i8); 9158 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 9159 case OP_VDUP0: 9160 case OP_VDUP1: 9161 case OP_VDUP2: 9162 case OP_VDUP3: { 9163 EVT EltTy = VT.getVectorElementType(); 9164 unsigned Opcode; 9165 if (EltTy == MVT::i8) 9166 Opcode = AArch64ISD::DUPLANE8; 9167 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) 9168 Opcode = AArch64ISD::DUPLANE16; 9169 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 9170 Opcode = AArch64ISD::DUPLANE32; 9171 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 9172 Opcode = AArch64ISD::DUPLANE64; 9173 else 9174 llvm_unreachable("Invalid vector element type?"); 9175 9176 if (VT.getSizeInBits() == 64) 9177 OpLHS = WidenVector(OpLHS, DAG); 9178 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 9179 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 9180 } 9181 case OP_VEXT1: 9182 case OP_VEXT2: 9183 case OP_VEXT3: { 9184 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 9185 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 9186 DAG.getConstant(Imm, dl, MVT::i32)); 9187 } 9188 case OP_VUZPL: 9189 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 9190 OpRHS); 9191 case OP_VUZPR: 9192 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 9193 OpRHS); 9194 case OP_VZIPL: 9195 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 9196 OpRHS); 9197 case OP_VZIPR: 9198 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 9199 OpRHS); 9200 case OP_VTRNL: 9201 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 9202 OpRHS); 9203 case OP_VTRNR: 9204 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 9205 OpRHS); 9206 } 9207 } 
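// For reference, the layout assumed by the decoding above: bits [31:30] of a
// perfect-shuffle entry hold the cost, bits [29:26] the OP_* opcode, bits
// [25:13] LHSID and bits [12:0] RHSID. Each ID packs a four-lane mask in base
// 9 (lane values 0-7, with 8 standing for an undef lane), so the identity
// mask <0,1,2,3> encodes as ((0*9+1)*9+2)*9+3 == 102, which is the
// (1*9+2)*9+3 value that OP_COPY compares LHSID against.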
9208 9209 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 9210 SelectionDAG &DAG) { 9211 // Check to see if we can use the TBL instruction. 9212 SDValue V1 = Op.getOperand(0); 9213 SDValue V2 = Op.getOperand(1); 9214 SDLoc DL(Op); 9215 9216 EVT EltVT = Op.getValueType().getVectorElementType(); 9217 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 9218 9219 SmallVector<SDValue, 8> TBLMask; 9220 for (int Val : ShuffleMask) { 9221 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 9222 unsigned Offset = Byte + Val * BytesPerElt; 9223 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 9224 } 9225 } 9226 9227 MVT IndexVT = MVT::v8i8; 9228 unsigned IndexLen = 8; 9229 if (Op.getValueSizeInBits() == 128) { 9230 IndexVT = MVT::v16i8; 9231 IndexLen = 16; 9232 } 9233 9234 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 9235 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 9236 9237 SDValue Shuffle; 9238 if (V2.getNode()->isUndef()) { 9239 if (IndexLen == 8) 9240 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 9241 Shuffle = DAG.getNode( 9242 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 9243 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 9244 DAG.getBuildVector(IndexVT, DL, 9245 makeArrayRef(TBLMask.data(), IndexLen))); 9246 } else { 9247 if (IndexLen == 8) { 9248 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 9249 Shuffle = DAG.getNode( 9250 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 9251 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 9252 DAG.getBuildVector(IndexVT, DL, 9253 makeArrayRef(TBLMask.data(), IndexLen))); 9254 } else { 9255 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 9256 // cannot currently represent the register constraints on the input 9257 // table registers. 9258 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 9259 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 9260 // IndexLen)); 9261 Shuffle = DAG.getNode( 9262 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 9263 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 9264 V2Cst, DAG.getBuildVector(IndexVT, DL, 9265 makeArrayRef(TBLMask.data(), IndexLen))); 9266 } 9267 } 9268 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 9269 } 9270 9271 static unsigned getDUPLANEOp(EVT EltType) { 9272 if (EltType == MVT::i8) 9273 return AArch64ISD::DUPLANE8; 9274 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) 9275 return AArch64ISD::DUPLANE16; 9276 if (EltType == MVT::i32 || EltType == MVT::f32) 9277 return AArch64ISD::DUPLANE32; 9278 if (EltType == MVT::i64 || EltType == MVT::f64) 9279 return AArch64ISD::DUPLANE64; 9280 9281 llvm_unreachable("Invalid vector element type?"); 9282 } 9283 9284 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, 9285 unsigned Opcode, SelectionDAG &DAG) { 9286 // Try to eliminate a bitcasted extract subvector before a DUPLANE. 9287 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { 9288 // Match: dup (bitcast (extract_subv X, C)), LaneC 9289 if (BitCast.getOpcode() != ISD::BITCAST || 9290 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) 9291 return false; 9292 9293 // The extract index must align in the destination type. That may not 9294 // happen if the bitcast is from narrow to wide type. 
9295 SDValue Extract = BitCast.getOperand(0); 9296 unsigned ExtIdx = Extract.getConstantOperandVal(1); 9297 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); 9298 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; 9299 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); 9300 if (ExtIdxInBits % CastedEltBitWidth != 0) 9301 return false; 9302 9303 // Update the lane value by offsetting with the scaled extract index. 9304 LaneC += ExtIdxInBits / CastedEltBitWidth; 9305 9306 // Determine the casted vector type of the wide vector input. 9307 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' 9308 // Examples: 9309 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 9310 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 9311 unsigned SrcVecNumElts = 9312 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; 9313 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), 9314 SrcVecNumElts); 9315 return true; 9316 }; 9317 MVT CastVT; 9318 if (getScaledOffsetDup(V, Lane, CastVT)) { 9319 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); 9320 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { 9321 // The lane is incremented by the index of the extract. 9322 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 9323 Lane += V.getConstantOperandVal(1); 9324 V = V.getOperand(0); 9325 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { 9326 // The lane is decremented if we are splatting from the 2nd operand. 9327 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 9328 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; 9329 Lane -= Idx * VT.getVectorNumElements() / 2; 9330 V = WidenVector(V.getOperand(Idx), DAG); 9331 } else if (VT.getSizeInBits() == 64) { 9332 // Widen the operand to 128-bit register with undef. 9333 V = WidenVector(V, DAG); 9334 } 9335 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); 9336 } 9337 9338 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 9339 SelectionDAG &DAG) const { 9340 SDLoc dl(Op); 9341 EVT VT = Op.getValueType(); 9342 9343 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 9344 9345 if (useSVEForFixedLengthVectorVT(VT)) 9346 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); 9347 9348 // Convert shuffles that are directly supported on NEON to target-specific 9349 // DAG nodes, instead of keeping them as shuffles and matching them again 9350 // during code selection. This is more efficient and avoids the possibility 9351 // of inconsistencies between legalization and selection. 9352 ArrayRef<int> ShuffleMask = SVN->getMask(); 9353 9354 SDValue V1 = Op.getOperand(0); 9355 SDValue V2 = Op.getOperand(1); 9356 9357 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!"); 9358 assert(ShuffleMask.size() == VT.getVectorNumElements() && 9359 "Unexpected VECTOR_SHUFFLE mask size!"); 9360 9361 if (SVN->isSplat()) { 9362 int Lane = SVN->getSplatIndex(); 9363 // If this is undef splat, generate it via "just" vdup, if possible. 9364 if (Lane == -1) 9365 Lane = 0; 9366 9367 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) 9368 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), 9369 V1.getOperand(0)); 9370 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- 9371 // constant. If so, we can just reference the lane's definition directly. 
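    // E.g. a splat of lane 2 of (build_vector a, b, c, d), where c is not a
    // constant, can simply become a DUP of c.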
9372 if (V1.getOpcode() == ISD::BUILD_VECTOR && 9373 !isa<ConstantSDNode>(V1.getOperand(Lane))) 9374 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); 9375 9376 // Otherwise, duplicate from the lane of the input vector. 9377 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); 9378 return constructDup(V1, Lane, dl, VT, Opcode, DAG); 9379 } 9380 9381 // Check if the mask matches a DUP for a wider element 9382 for (unsigned LaneSize : {64U, 32U, 16U}) { 9383 unsigned Lane = 0; 9384 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { 9385 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 9386 : LaneSize == 32 ? AArch64ISD::DUPLANE32 9387 : AArch64ISD::DUPLANE16; 9388 // Cast V1 to an integer vector with required lane size 9389 MVT NewEltTy = MVT::getIntegerVT(LaneSize); 9390 unsigned NewEltCount = VT.getSizeInBits() / LaneSize; 9391 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); 9392 V1 = DAG.getBitcast(NewVecTy, V1); 9393 // Constuct the DUP instruction 9394 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); 9395 // Cast back to the original type 9396 return DAG.getBitcast(VT, V1); 9397 } 9398 } 9399 9400 if (isREVMask(ShuffleMask, VT, 64)) 9401 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); 9402 if (isREVMask(ShuffleMask, VT, 32)) 9403 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); 9404 if (isREVMask(ShuffleMask, VT, 16)) 9405 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); 9406 9407 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) || 9408 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) && 9409 ShuffleVectorInst::isReverseMask(ShuffleMask)) { 9410 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1); 9411 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev, 9412 DAG.getConstant(8, dl, MVT::i32)); 9413 } 9414 9415 bool ReverseEXT = false; 9416 unsigned Imm; 9417 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { 9418 if (ReverseEXT) 9419 std::swap(V1, V2); 9420 Imm *= getExtFactor(V1); 9421 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, 9422 DAG.getConstant(Imm, dl, MVT::i32)); 9423 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { 9424 Imm *= getExtFactor(V1); 9425 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, 9426 DAG.getConstant(Imm, dl, MVT::i32)); 9427 } 9428 9429 unsigned WhichResult; 9430 if (isZIPMask(ShuffleMask, VT, WhichResult)) { 9431 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 9432 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 9433 } 9434 if (isUZPMask(ShuffleMask, VT, WhichResult)) { 9435 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 9436 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 9437 } 9438 if (isTRNMask(ShuffleMask, VT, WhichResult)) { 9439 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 9440 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 9441 } 9442 9443 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 9444 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 9445 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 9446 } 9447 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 9448 unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::UZP1 : AArch64ISD::UZP2; 9449 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 9450 } 9451 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 9452 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 9453 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 9454 } 9455 9456 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 9457 return Concat; 9458 9459 bool DstIsLeft; 9460 int Anomaly; 9461 int NumInputElements = V1.getValueType().getVectorNumElements(); 9462 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 9463 SDValue DstVec = DstIsLeft ? V1 : V2; 9464 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 9465 9466 SDValue SrcVec = V1; 9467 int SrcLane = ShuffleMask[Anomaly]; 9468 if (SrcLane >= NumInputElements) { 9469 SrcVec = V2; 9470 SrcLane -= VT.getVectorNumElements(); 9471 } 9472 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 9473 9474 EVT ScalarVT = VT.getVectorElementType(); 9475 9476 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) 9477 ScalarVT = MVT::i32; 9478 9479 return DAG.getNode( 9480 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 9481 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 9482 DstLaneV); 9483 } 9484 9485 // If the shuffle is not directly supported and it has 4 elements, use 9486 // the PerfectShuffle-generated table to synthesize it from other shuffles. 9487 unsigned NumElts = VT.getVectorNumElements(); 9488 if (NumElts == 4) { 9489 unsigned PFIndexes[4]; 9490 for (unsigned i = 0; i != 4; ++i) { 9491 if (ShuffleMask[i] < 0) 9492 PFIndexes[i] = 8; 9493 else 9494 PFIndexes[i] = ShuffleMask[i]; 9495 } 9496 9497 // Compute the index in the perfect shuffle table. 9498 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 9499 PFIndexes[2] * 9 + PFIndexes[3]; 9500 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 9501 unsigned Cost = (PFEntry >> 30); 9502 9503 if (Cost <= 4) 9504 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 9505 } 9506 9507 return GenerateTBL(Op, ShuffleMask, DAG); 9508 } 9509 9510 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, 9511 SelectionDAG &DAG) const { 9512 SDLoc dl(Op); 9513 EVT VT = Op.getValueType(); 9514 EVT ElemVT = VT.getScalarType(); 9515 SDValue SplatVal = Op.getOperand(0); 9516 9517 if (useSVEForFixedLengthVectorVT(VT)) 9518 return LowerToScalableOp(Op, DAG); 9519 9520 // Extend input splat value where needed to fit into a GPR (32b or 64b only) 9521 // FPRs don't have this restriction. 9522 switch (ElemVT.getSimpleVT().SimpleTy) { 9523 case MVT::i1: { 9524 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific 9525 // lowering code. 9526 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { 9527 if (ConstVal->isOne()) 9528 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); 9529 // TODO: Add special case for constant false 9530 } 9531 // The general case of i1. There isn't any natural way to do this, 9532 // so we use some trickery with whilelo. 
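    // Sketch of the trick, relying on whilelo's unsigned-compare semantics:
    // the splat value is sign-extended from i1 to i64, giving 0 for false and
    // all ones for true. whilelo(0, 0) then yields an all-false predicate,
    // while whilelo(0, 0xffffffffffffffff) saturates to an all-true
    // predicate, which is exactly the requested i1 splat.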
9533 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); 9534 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, 9535 DAG.getValueType(MVT::i1)); 9536 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, 9537 MVT::i64); 9538 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, 9539 DAG.getConstant(0, dl, MVT::i64), SplatVal); 9540 } 9541 case MVT::i8: 9542 case MVT::i16: 9543 case MVT::i32: 9544 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); 9545 break; 9546 case MVT::i64: 9547 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); 9548 break; 9549 case MVT::f16: 9550 case MVT::bf16: 9551 case MVT::f32: 9552 case MVT::f64: 9553 // Fine as is 9554 break; 9555 default: 9556 report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); 9557 } 9558 9559 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); 9560 } 9561 9562 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, 9563 SelectionDAG &DAG) const { 9564 SDLoc DL(Op); 9565 9566 EVT VT = Op.getValueType(); 9567 if (!isTypeLegal(VT) || !VT.isScalableVector()) 9568 return SDValue(); 9569 9570 // Current lowering only supports the SVE-ACLE types. 9571 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) 9572 return SDValue(); 9573 9574 // The DUPQ operation is independent of element type so normalise to i64s. 9575 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); 9576 SDValue Idx128 = Op.getOperand(2); 9577 9578 // DUPQ can be used when idx is in range. 9579 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); 9580 if (CIdx && (CIdx->getZExtValue() <= 3)) { 9581 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); 9582 SDNode *DUPQ = 9583 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); 9584 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); 9585 } 9586 9587 // The ACLE says this must produce the same result as: 9588 // svtbl(data, svadd_x(svptrue_b64(), 9589 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), 9590 // index * 2)) 9591 SDValue One = DAG.getConstant(1, DL, MVT::i64); 9592 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); 9593 9594 // create the vector 0,1,0,1,... 9595 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64); 9596 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); 9597 9598 // create the vector idx64,idx64+1,idx64,idx64+1,... 9599 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); 9600 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); 9601 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); 9602 9603 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
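  // E.g. if Idx128 evaluates to 2 at run time, ShuffleMask is
  // <4, 5, 4, 5, ...> and the TBL below replicates 128-bit quadword 2 of the
  // input across the whole result.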
9604 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); 9605 return DAG.getNode(ISD::BITCAST, DL, VT, TBL); 9606 } 9607 9608 9609 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 9610 APInt &UndefBits) { 9611 EVT VT = BVN->getValueType(0); 9612 APInt SplatBits, SplatUndef; 9613 unsigned SplatBitSize; 9614 bool HasAnyUndefs; 9615 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 9616 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 9617 9618 for (unsigned i = 0; i < NumSplats; ++i) { 9619 CnstBits <<= SplatBitSize; 9620 UndefBits <<= SplatBitSize; 9621 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 9622 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 9623 } 9624 9625 return true; 9626 } 9627 9628 return false; 9629 } 9630 9631 // Try 64-bit splatted SIMD immediate. 9632 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 9633 const APInt &Bits) { 9634 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9635 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9636 EVT VT = Op.getValueType(); 9637 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; 9638 9639 if (AArch64_AM::isAdvSIMDModImmType10(Value)) { 9640 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); 9641 9642 SDLoc dl(Op); 9643 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 9644 DAG.getConstant(Value, dl, MVT::i32)); 9645 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9646 } 9647 } 9648 9649 return SDValue(); 9650 } 9651 9652 // Try 32-bit splatted SIMD immediate. 9653 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 9654 const APInt &Bits, 9655 const SDValue *LHS = nullptr) { 9656 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9657 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9658 EVT VT = Op.getValueType(); 9659 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 9660 bool isAdvSIMDModImm = false; 9661 uint64_t Shift; 9662 9663 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { 9664 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); 9665 Shift = 0; 9666 } 9667 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { 9668 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); 9669 Shift = 8; 9670 } 9671 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { 9672 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); 9673 Shift = 16; 9674 } 9675 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { 9676 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); 9677 Shift = 24; 9678 } 9679 9680 if (isAdvSIMDModImm) { 9681 SDLoc dl(Op); 9682 SDValue Mov; 9683 9684 if (LHS) 9685 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 9686 DAG.getConstant(Value, dl, MVT::i32), 9687 DAG.getConstant(Shift, dl, MVT::i32)); 9688 else 9689 Mov = DAG.getNode(NewOp, dl, MovTy, 9690 DAG.getConstant(Value, dl, MVT::i32), 9691 DAG.getConstant(Shift, dl, MVT::i32)); 9692 9693 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9694 } 9695 } 9696 9697 return SDValue(); 9698 } 9699 9700 // Try 16-bit splatted SIMD immediate. 9701 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 9702 const APInt &Bits, 9703 const SDValue *LHS = nullptr) { 9704 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9705 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9706 EVT VT = Op.getValueType(); 9707 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 9708 bool isAdvSIMDModImm = false; 9709 uint64_t Shift; 9710 9711 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { 9712 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); 9713 Shift = 0; 9714 } 9715 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { 9716 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); 9717 Shift = 8; 9718 } 9719 9720 if (isAdvSIMDModImm) { 9721 SDLoc dl(Op); 9722 SDValue Mov; 9723 9724 if (LHS) 9725 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 9726 DAG.getConstant(Value, dl, MVT::i32), 9727 DAG.getConstant(Shift, dl, MVT::i32)); 9728 else 9729 Mov = DAG.getNode(NewOp, dl, MovTy, 9730 DAG.getConstant(Value, dl, MVT::i32), 9731 DAG.getConstant(Shift, dl, MVT::i32)); 9732 9733 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9734 } 9735 } 9736 9737 return SDValue(); 9738 } 9739 9740 // Try 32-bit splatted SIMD immediate with shifted ones. 9741 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, 9742 SelectionDAG &DAG, const APInt &Bits) { 9743 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9744 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9745 EVT VT = Op.getValueType(); 9746 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 9747 bool isAdvSIMDModImm = false; 9748 uint64_t Shift; 9749 9750 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { 9751 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); 9752 Shift = 264; 9753 } 9754 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { 9755 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); 9756 Shift = 272; 9757 } 9758 9759 if (isAdvSIMDModImm) { 9760 SDLoc dl(Op); 9761 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 9762 DAG.getConstant(Value, dl, MVT::i32), 9763 DAG.getConstant(Shift, dl, MVT::i32)); 9764 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9765 } 9766 } 9767 9768 return SDValue(); 9769 } 9770 9771 // Try 8-bit splatted SIMD immediate. 9772 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 9773 const APInt &Bits) { 9774 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9775 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9776 EVT VT = Op.getValueType(); 9777 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 9778 9779 if (AArch64_AM::isAdvSIMDModImmType9(Value)) { 9780 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); 9781 9782 SDLoc dl(Op); 9783 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 9784 DAG.getConstant(Value, dl, MVT::i32)); 9785 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9786 } 9787 } 9788 9789 return SDValue(); 9790 } 9791 9792 // Try FP splatted SIMD immediate. 9793 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 9794 const APInt &Bits) { 9795 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 9796 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 9797 EVT VT = Op.getValueType(); 9798 bool isWide = (VT.getSizeInBits() == 128); 9799 MVT MovTy; 9800 bool isAdvSIMDModImm = false; 9801 9802 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { 9803 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); 9804 MovTy = isWide ? 
MVT::v4f32 : MVT::v2f32; 9805 } 9806 else if (isWide && 9807 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { 9808 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); 9809 MovTy = MVT::v2f64; 9810 } 9811 9812 if (isAdvSIMDModImm) { 9813 SDLoc dl(Op); 9814 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 9815 DAG.getConstant(Value, dl, MVT::i32)); 9816 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 9817 } 9818 } 9819 9820 return SDValue(); 9821 } 9822 9823 // Specialized code to quickly find if PotentialBVec is a BuildVector that 9824 // consists of only the same constant int value, returned in reference arg 9825 // ConstVal 9826 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 9827 uint64_t &ConstVal) { 9828 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 9829 if (!Bvec) 9830 return false; 9831 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 9832 if (!FirstElt) 9833 return false; 9834 EVT VT = Bvec->getValueType(0); 9835 unsigned NumElts = VT.getVectorNumElements(); 9836 for (unsigned i = 1; i < NumElts; ++i) 9837 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 9838 return false; 9839 ConstVal = FirstElt->getZExtValue(); 9840 return true; 9841 } 9842 9843 static unsigned getIntrinsicID(const SDNode *N) { 9844 unsigned Opcode = N->getOpcode(); 9845 switch (Opcode) { 9846 default: 9847 return Intrinsic::not_intrinsic; 9848 case ISD::INTRINSIC_WO_CHAIN: { 9849 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9850 if (IID < Intrinsic::num_intrinsics) 9851 return IID; 9852 return Intrinsic::not_intrinsic; 9853 } 9854 } 9855 } 9856 9857 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 9858 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 9859 // BUILD_VECTORs with constant element C1, C2 is a constant, and: 9860 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) 9861 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) 9862 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. 9863 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 9864 EVT VT = N->getValueType(0); 9865 9866 if (!VT.isVector()) 9867 return SDValue(); 9868 9869 SDLoc DL(N); 9870 9871 SDValue And; 9872 SDValue Shift; 9873 9874 SDValue FirstOp = N->getOperand(0); 9875 unsigned FirstOpc = FirstOp.getOpcode(); 9876 SDValue SecondOp = N->getOperand(1); 9877 unsigned SecondOpc = SecondOp.getOpcode(); 9878 9879 // Is one of the operands an AND or a BICi? The AND may have been optimised to 9880 // a BICi in order to use an immediate instead of a register. 9881 // Is the other operand an shl or lshr? This will have been turned into: 9882 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. 9883 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && 9884 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { 9885 And = FirstOp; 9886 Shift = SecondOp; 9887 9888 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && 9889 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { 9890 And = SecondOp; 9891 Shift = FirstOp; 9892 } else 9893 return SDValue(); 9894 9895 bool IsAnd = And.getOpcode() == ISD::AND; 9896 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; 9897 9898 // Is the shift amount constant? 
9899 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 9900 if (!C2node) 9901 return SDValue(); 9902 9903 uint64_t C1; 9904 if (IsAnd) { 9905 // Is the and mask vector all constant? 9906 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 9907 return SDValue(); 9908 } else { 9909 // Reconstruct the corresponding AND immediate from the two BICi immediates. 9910 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); 9911 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); 9912 assert(C1nodeImm && C1nodeShift); 9913 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); 9914 } 9915 9916 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or 9917 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account 9918 // how much one can shift elements of a particular size? 9919 uint64_t C2 = C2node->getZExtValue(); 9920 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 9921 if (C2 > ElemSizeInBits) 9922 return SDValue(); 9923 9924 APInt C1AsAPInt(ElemSizeInBits, C1); 9925 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) 9926 : APInt::getLowBitsSet(ElemSizeInBits, C2); 9927 if (C1AsAPInt != RequiredC1) 9928 return SDValue(); 9929 9930 SDValue X = And.getOperand(0); 9931 SDValue Y = Shift.getOperand(0); 9932 9933 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; 9934 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); 9935 9936 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 9937 LLVM_DEBUG(N->dump(&DAG)); 9938 LLVM_DEBUG(dbgs() << "into: \n"); 9939 LLVM_DEBUG(ResultSLI->dump(&DAG)); 9940 9941 ++NumShiftInserts; 9942 return ResultSLI; 9943 } 9944 9945 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 9946 SelectionDAG &DAG) const { 9947 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 9948 return LowerToScalableOp(Op, DAG); 9949 9950 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 9951 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 9952 return Res; 9953 9954 EVT VT = Op.getValueType(); 9955 9956 SDValue LHS = Op.getOperand(0); 9957 BuildVectorSDNode *BVN = 9958 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 9959 if (!BVN) { 9960 // OR commutes, so try swapping the operands. 9961 LHS = Op.getOperand(1); 9962 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 9963 } 9964 if (!BVN) 9965 return Op; 9966 9967 APInt DefBits(VT.getSizeInBits(), 0); 9968 APInt UndefBits(VT.getSizeInBits(), 0); 9969 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 9970 SDValue NewOp; 9971 9972 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 9973 DefBits, &LHS)) || 9974 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 9975 DefBits, &LHS))) 9976 return NewOp; 9977 9978 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 9979 UndefBits, &LHS)) || 9980 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 9981 UndefBits, &LHS))) 9982 return NewOp; 9983 } 9984 9985 // We can always fall back to a non-immediate OR. 9986 return Op; 9987 } 9988 9989 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 9990 // be truncated to fit element width. 
9991 static SDValue NormalizeBuildVector(SDValue Op, 9992 SelectionDAG &DAG) { 9993 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 9994 SDLoc dl(Op); 9995 EVT VT = Op.getValueType(); 9996 EVT EltTy= VT.getVectorElementType(); 9997 9998 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 9999 return Op; 10000 10001 SmallVector<SDValue, 16> Ops; 10002 for (SDValue Lane : Op->ops()) { 10003 // For integer vectors, type legalization would have promoted the 10004 // operands already. Otherwise, if Op is a floating-point splat 10005 // (with operands cast to integers), then the only possibilities 10006 // are constants and UNDEFs. 10007 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 10008 APInt LowBits(EltTy.getSizeInBits(), 10009 CstLane->getZExtValue()); 10010 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 10011 } else if (Lane.getNode()->isUndef()) { 10012 Lane = DAG.getUNDEF(MVT::i32); 10013 } else { 10014 assert(Lane.getValueType() == MVT::i32 && 10015 "Unexpected BUILD_VECTOR operand type"); 10016 } 10017 Ops.push_back(Lane); 10018 } 10019 return DAG.getBuildVector(VT, dl, Ops); 10020 } 10021 10022 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { 10023 EVT VT = Op.getValueType(); 10024 10025 APInt DefBits(VT.getSizeInBits(), 0); 10026 APInt UndefBits(VT.getSizeInBits(), 0); 10027 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 10028 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 10029 SDValue NewOp; 10030 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 10031 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 10032 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 10033 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 10034 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 10035 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 10036 return NewOp; 10037 10038 DefBits = ~DefBits; 10039 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 10040 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 10041 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 10042 return NewOp; 10043 10044 DefBits = UndefBits; 10045 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 10046 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 10047 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 10048 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 10049 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 10050 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 10051 return NewOp; 10052 10053 DefBits = ~UndefBits; 10054 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 10055 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 10056 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 10057 return NewOp; 10058 } 10059 10060 return SDValue(); 10061 } 10062 10063 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 10064 SelectionDAG &DAG) const { 10065 EVT VT = Op.getValueType(); 10066 10067 // Try to build a simple constant vector. 10068 Op = NormalizeBuildVector(Op, DAG); 10069 if (VT.isInteger()) { 10070 // Certain vector constants, used to express things like logical NOT and 10071 // arithmetic NEG, are passed through unmodified. 
This allows special 10072 // patterns for these operations to match, which will lower these constants 10073 // to whatever is proven necessary. 10074 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 10075 if (BVN->isConstant()) 10076 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { 10077 unsigned BitSize = VT.getVectorElementType().getSizeInBits(); 10078 APInt Val(BitSize, 10079 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); 10080 if (Val.isNullValue() || Val.isAllOnesValue()) 10081 return Op; 10082 } 10083 } 10084 10085 if (SDValue V = ConstantBuildVector(Op, DAG)) 10086 return V; 10087 10088 // Scan through the operands to find some interesting properties we can 10089 // exploit: 10090 // 1) If only one value is used, we can use a DUP, or 10091 // 2) if only the low element is not undef, we can just insert that, or 10092 // 3) if only one constant value is used (w/ some non-constant lanes), 10093 // we can splat the constant value into the whole vector then fill 10094 // in the non-constant lanes. 10095 // 4) FIXME: If different constant values are used, but we can intelligently 10096 // select the values we'll be overwriting for the non-constant 10097 // lanes such that we can directly materialize the vector 10098 // some other way (MOVI, e.g.), we can be sneaky. 10099 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. 10100 SDLoc dl(Op); 10101 unsigned NumElts = VT.getVectorNumElements(); 10102 bool isOnlyLowElement = true; 10103 bool usesOnlyOneValue = true; 10104 bool usesOnlyOneConstantValue = true; 10105 bool isConstant = true; 10106 bool AllLanesExtractElt = true; 10107 unsigned NumConstantLanes = 0; 10108 unsigned NumDifferentLanes = 0; 10109 unsigned NumUndefLanes = 0; 10110 SDValue Value; 10111 SDValue ConstantValue; 10112 for (unsigned i = 0; i < NumElts; ++i) { 10113 SDValue V = Op.getOperand(i); 10114 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10115 AllLanesExtractElt = false; 10116 if (V.isUndef()) { 10117 ++NumUndefLanes; 10118 continue; 10119 } 10120 if (i > 0) 10121 isOnlyLowElement = false; 10122 if (!isIntOrFPConstant(V)) 10123 isConstant = false; 10124 10125 if (isIntOrFPConstant(V)) { 10126 ++NumConstantLanes; 10127 if (!ConstantValue.getNode()) 10128 ConstantValue = V; 10129 else if (ConstantValue != V) 10130 usesOnlyOneConstantValue = false; 10131 } 10132 10133 if (!Value.getNode()) 10134 Value = V; 10135 else if (V != Value) { 10136 usesOnlyOneValue = false; 10137 ++NumDifferentLanes; 10138 } 10139 } 10140 10141 if (!Value.getNode()) { 10142 LLVM_DEBUG( 10143 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); 10144 return DAG.getUNDEF(VT); 10145 } 10146 10147 // Convert BUILD_VECTOR where all elements but the lowest are undef into 10148 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector 10149 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. 10150 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { 10151 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " 10152 "SCALAR_TO_VECTOR node\n"); 10153 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 10154 } 10155 10156 if (AllLanesExtractElt) { 10157 SDNode *Vector = nullptr; 10158 bool Even = false; 10159 bool Odd = false; 10160 // Check whether the extract elements match the Even pattern <0,2,4,...> or 10161 // the Odd pattern <1,3,5,...>. 
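    // E.g. (hypothetical) a v4i16 build whose lanes are extract_elt of a
    // v8i16 V at indices <0, 2, 4, 6> matches the Even pattern and becomes
    // UZP1 of the low and high halves of V; indices <1, 3, 5, 7> give UZP2
    // instead.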
10162 for (unsigned i = 0; i < NumElts; ++i) { 10163 SDValue V = Op.getOperand(i); 10164 const SDNode *N = V.getNode(); 10165 if (!isa<ConstantSDNode>(N->getOperand(1))) 10166 break; 10167 SDValue N0 = N->getOperand(0); 10168 10169 // All elements are extracted from the same vector. 10170 if (!Vector) { 10171 Vector = N0.getNode(); 10172 // Check that the type of EXTRACT_VECTOR_ELT matches the type of 10173 // BUILD_VECTOR. 10174 if (VT.getVectorElementType() != 10175 N0.getValueType().getVectorElementType()) 10176 break; 10177 } else if (Vector != N0.getNode()) { 10178 Odd = false; 10179 Even = false; 10180 break; 10181 } 10182 10183 // Extracted values are either at Even indices <0,2,4,...> or at Odd 10184 // indices <1,3,5,...>. 10185 uint64_t Val = N->getConstantOperandVal(1); 10186 if (Val == 2 * i) { 10187 Even = true; 10188 continue; 10189 } 10190 if (Val - 1 == 2 * i) { 10191 Odd = true; 10192 continue; 10193 } 10194 10195 // Something does not match: abort. 10196 Odd = false; 10197 Even = false; 10198 break; 10199 } 10200 if (Even || Odd) { 10201 SDValue LHS = 10202 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 10203 DAG.getConstant(0, dl, MVT::i64)); 10204 SDValue RHS = 10205 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 10206 DAG.getConstant(NumElts, dl, MVT::i64)); 10207 10208 if (Even && !Odd) 10209 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, 10210 RHS); 10211 if (Odd && !Even) 10212 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, 10213 RHS); 10214 } 10215 } 10216 10217 // Use DUP for non-constant splats. For f32 constant splats, reduce to 10218 // i32 and try again. 10219 if (usesOnlyOneValue) { 10220 if (!isConstant) { 10221 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 10222 Value.getValueType() != VT) { 10223 LLVM_DEBUG( 10224 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); 10225 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 10226 } 10227 10228 // This is actually a DUPLANExx operation, which keeps everything vectory. 
10229 10230 SDValue Lane = Value.getOperand(1); 10231 Value = Value.getOperand(0); 10232 if (Value.getValueSizeInBits() == 64) { 10233 LLVM_DEBUG( 10234 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " 10235 "widening it\n"); 10236 Value = WidenVector(Value, DAG); 10237 } 10238 10239 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 10240 return DAG.getNode(Opcode, dl, VT, Value, Lane); 10241 } 10242 10243 if (VT.getVectorElementType().isFloatingPoint()) { 10244 SmallVector<SDValue, 8> Ops; 10245 EVT EltTy = VT.getVectorElementType(); 10246 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || 10247 EltTy == MVT::f64) && "Unsupported floating-point vector type"); 10248 LLVM_DEBUG( 10249 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " 10250 "BITCASTS, and try again\n"); 10251 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 10252 for (unsigned i = 0; i < NumElts; ++i) 10253 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 10254 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 10255 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 10256 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; 10257 Val.dump();); 10258 Val = LowerBUILD_VECTOR(Val, DAG); 10259 if (Val.getNode()) 10260 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 10261 } 10262 } 10263 10264 // If we need to insert a small number of different non-constant elements and 10265 // the vector width is sufficiently large, prefer using DUP with the common 10266 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, 10267 // skip the constant lane handling below. 10268 bool PreferDUPAndInsert = 10269 !isConstant && NumDifferentLanes >= 1 && 10270 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && 10271 NumDifferentLanes >= NumConstantLanes; 10272 10273 // If there was only one constant value used and for more than one lane, 10274 // start by splatting that value, then replace the non-constant lanes. This 10275 // is better than the default, which will perform a separate initialization 10276 // for each lane. 10277 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { 10278 // Firstly, try to materialize the splat constant. 10279 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), 10280 Val = ConstantBuildVector(Vec, DAG); 10281 if (!Val) { 10282 // Otherwise, materialize the constant and splat it. 10283 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 10284 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); 10285 } 10286 10287 // Now insert the non-constant lanes. 10288 for (unsigned i = 0; i < NumElts; ++i) { 10289 SDValue V = Op.getOperand(i); 10290 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 10291 if (!isIntOrFPConstant(V)) 10292 // Note that type legalization likely mucked about with the VT of the 10293 // source operand, so we may have to convert it here before inserting. 10294 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 10295 } 10296 return Val; 10297 } 10298 10299 // This will generate a load from the constant pool. 10300 if (isConstant) { 10301 LLVM_DEBUG( 10302 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " 10303 "expansion\n"); 10304 return SDValue(); 10305 } 10306 10307 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 
10308 if (NumElts >= 4) { 10309 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 10310 return shuffle; 10311 } 10312 10313 if (PreferDUPAndInsert) { 10314 // First, build a constant vector with the common element. 10315 SmallVector<SDValue, 8> Ops(NumElts, Value); 10316 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); 10317 // Next, insert the elements that do not match the common value. 10318 for (unsigned I = 0; I < NumElts; ++I) 10319 if (Op.getOperand(I) != Value) 10320 NewVector = 10321 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, 10322 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); 10323 10324 return NewVector; 10325 } 10326 10327 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 10328 // know the default expansion would otherwise fall back on something even 10329 // worse. For a vector with one or two non-undef values, that's 10330 // scalar_to_vector for the elements followed by a shuffle (provided the 10331 // shuffle is valid for the target) and materialization element by element 10332 // on the stack followed by a load for everything else. 10333 if (!isConstant && !usesOnlyOneValue) { 10334 LLVM_DEBUG( 10335 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " 10336 "of INSERT_VECTOR_ELT\n"); 10337 10338 SDValue Vec = DAG.getUNDEF(VT); 10339 SDValue Op0 = Op.getOperand(0); 10340 unsigned i = 0; 10341 10342 // Use SCALAR_TO_VECTOR for lane zero to 10343 // a) Avoid a RMW dependency on the full vector register, and 10344 // b) Allow the register coalescer to fold away the copy if the 10345 // value is already in an S or D register, and we're forced to emit an 10346 // INSERT_SUBREG that we can't fold anywhere. 10347 // 10348 // We also allow types like i8 and i16 which are illegal scalar but legal 10349 // vector element types. After type-legalization the inserted value is 10350 // extended (i32) and it is safe to cast them to the vector type by ignoring 10351 // the upper bits of the lowest lane (e.g. v8i8, v4i16). 
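  // A minimal sketch, assuming a v4i32 with operands %a, %b, %c, undef reaches
  // this point; the loop below builds roughly:
  //   t0 = scalar_to_vector %a
  //   t1 = insert_vector_elt t0, %b, 1
  //   t2 = insert_vector_elt t1, %c, 2
  // and the trailing undef lane is simply never written.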
10352 if (!Op0.isUndef()) { 10353 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); 10354 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); 10355 ++i; 10356 } 10357 LLVM_DEBUG(if (i < NumElts) dbgs() 10358 << "Creating nodes for the other vector elements:\n";); 10359 for (; i < NumElts; ++i) { 10360 SDValue V = Op.getOperand(i); 10361 if (V.isUndef()) 10362 continue; 10363 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 10364 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 10365 } 10366 return Vec; 10367 } 10368 10369 LLVM_DEBUG( 10370 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " 10371 "better alternative\n"); 10372 return SDValue(); 10373 } 10374 10375 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, 10376 SelectionDAG &DAG) const { 10377 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 10378 return LowerFixedLengthConcatVectorsToSVE(Op, DAG); 10379 10380 assert(Op.getValueType().isScalableVector() && 10381 isTypeLegal(Op.getValueType()) && 10382 "Expected legal scalable vector type!"); 10383 10384 if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) 10385 return Op; 10386 10387 return SDValue(); 10388 } 10389 10390 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 10391 SelectionDAG &DAG) const { 10392 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 10393 10394 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 10395 return LowerFixedLengthInsertVectorElt(Op, DAG); 10396 10397 // Check for non-constant or out of range lane. 10398 EVT VT = Op.getOperand(0).getValueType(); 10399 10400 if (VT.getScalarType() == MVT::i1) { 10401 EVT VectorVT = getPromotedVTForPredicate(VT); 10402 SDLoc DL(Op); 10403 SDValue ExtendedVector = 10404 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT); 10405 SDValue ExtendedValue = 10406 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL, 10407 VectorVT.getScalarType().getSizeInBits() < 32 10408 ? MVT::i32 10409 : VectorVT.getScalarType()); 10410 ExtendedVector = 10411 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector, 10412 ExtendedValue, Op.getOperand(2)); 10413 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT); 10414 } 10415 10416 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 10417 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 10418 return SDValue(); 10419 10420 // Insertion/extraction are legal for V128 types. 10421 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 10422 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 10423 VT == MVT::v8f16 || VT == MVT::v8bf16) 10424 return Op; 10425 10426 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 10427 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 10428 VT != MVT::v4bf16) 10429 return SDValue(); 10430 10431 // For V64 types, we perform insertion by expanding the value 10432 // to a V128 type and perform the insertion on that. 10433 SDLoc DL(Op); 10434 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 10435 EVT WideTy = WideVec.getValueType(); 10436 10437 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 10438 Op.getOperand(1), Op.getOperand(2)); 10439 // Re-narrow the resultant vector. 
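  // E.g. (illustrative): an insert into v4i16 is widened to v8i16 by
  // concatenating with undef, the element is inserted into the wide vector,
  // and the v4i16 result returned below is the low 64-bit half of that v8i16.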
10440 return NarrowVector(Node, DAG); 10441 } 10442 10443 SDValue 10444 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 10445 SelectionDAG &DAG) const { 10446 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 10447 EVT VT = Op.getOperand(0).getValueType(); 10448 10449 if (VT.getScalarType() == MVT::i1) { 10450 // We can't directly extract from an SVE predicate; extend it first. 10451 // (This isn't the only possible lowering, but it's straightforward.) 10452 EVT VectorVT = getPromotedVTForPredicate(VT); 10453 SDLoc DL(Op); 10454 SDValue Extend = 10455 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0)); 10456 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32; 10457 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, 10458 Extend, Op.getOperand(1)); 10459 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType()); 10460 } 10461 10462 if (useSVEForFixedLengthVectorVT(VT)) 10463 return LowerFixedLengthExtractVectorElt(Op, DAG); 10464 10465 // Check for non-constant or out of range lane. 10466 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 10467 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 10468 return SDValue(); 10469 10470 // Insertion/extraction are legal for V128 types. 10471 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 10472 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 10473 VT == MVT::v8f16 || VT == MVT::v8bf16) 10474 return Op; 10475 10476 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 10477 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 10478 VT != MVT::v4bf16) 10479 return SDValue(); 10480 10481 // For V64 types, we perform extraction by expanding the value 10482 // to a V128 type and perform the extraction on that. 10483 SDLoc DL(Op); 10484 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 10485 EVT WideTy = WideVec.getValueType(); 10486 10487 EVT ExtrTy = WideTy.getVectorElementType(); 10488 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 10489 ExtrTy = MVT::i32; 10490 10491 // For extractions, we just return the result directly. 10492 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 10493 Op.getOperand(1)); 10494 } 10495 10496 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 10497 SelectionDAG &DAG) const { 10498 assert(Op.getValueType().isFixedLengthVector() && 10499 "Only cases that extract a fixed length vector are supported!"); 10500 10501 EVT InVT = Op.getOperand(0).getValueType(); 10502 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10503 unsigned Size = Op.getValueSizeInBits(); 10504 10505 if (InVT.isScalableVector()) { 10506 // This will be matched by custom code during ISelDAGToDAG. 10507 if (Idx == 0 && isPackedVectorType(InVT, DAG)) 10508 return Op; 10509 10510 return SDValue(); 10511 } 10512 10513 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 10514 if (Idx == 0 && InVT.getSizeInBits() <= 128) 10515 return Op; 10516 10517 // If this is extracting the upper 64-bits of a 128-bit vector, we match 10518 // that directly. 
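  // For instance (illustrative), (extract_subvector v4i32:%v, 2) returning
  // v2i32 covers bits [127:64] of %v and satisfies the check below
  // (Size == 64, Idx * 32 == 64, InVT is 128 bits).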
10519 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && 10520 InVT.getSizeInBits() == 128) 10521 return Op; 10522 10523 return SDValue(); 10524 } 10525 10526 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, 10527 SelectionDAG &DAG) const { 10528 assert(Op.getValueType().isScalableVector() && 10529 "Only expect to lower inserts into scalable vectors!"); 10530 10531 EVT InVT = Op.getOperand(1).getValueType(); 10532 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10533 10534 if (InVT.isScalableVector()) { 10535 SDLoc DL(Op); 10536 EVT VT = Op.getValueType(); 10537 10538 if (!isTypeLegal(VT) || !VT.isInteger()) 10539 return SDValue(); 10540 10541 SDValue Vec0 = Op.getOperand(0); 10542 SDValue Vec1 = Op.getOperand(1); 10543 10544 // Ensure the subvector is half the size of the main vector. 10545 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) 10546 return SDValue(); 10547 10548 // Extend elements of smaller vector... 10549 EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); 10550 SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); 10551 10552 if (Idx == 0) { 10553 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); 10554 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); 10555 } else if (Idx == InVT.getVectorMinNumElements()) { 10556 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); 10557 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); 10558 } 10559 10560 return SDValue(); 10561 } 10562 10563 // This will be matched by custom code during ISelDAGToDAG. 10564 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) 10565 return Op; 10566 10567 return SDValue(); 10568 } 10569 10570 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { 10571 EVT VT = Op.getValueType(); 10572 10573 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) 10574 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); 10575 10576 assert(VT.isScalableVector() && "Expected a scalable vector."); 10577 10578 bool Signed = Op.getOpcode() == ISD::SDIV; 10579 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; 10580 10581 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) 10582 return LowerToPredicatedOp(Op, DAG, PredOpcode); 10583 10584 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit 10585 // operations, and truncate the result. 10586 EVT WidenedVT; 10587 if (VT == MVT::nxv16i8) 10588 WidenedVT = MVT::nxv8i16; 10589 else if (VT == MVT::nxv8i16) 10590 WidenedVT = MVT::nxv4i32; 10591 else 10592 llvm_unreachable("Unexpected Custom DIV operation"); 10593 10594 SDLoc dl(Op); 10595 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; 10596 unsigned UnpkHi = Signed ? 
AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; 10597 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); 10598 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); 10599 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); 10600 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); 10601 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); 10602 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); 10603 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); 10604 } 10605 10606 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 10607 // Currently no fixed length shuffles that require SVE are legal. 10608 if (useSVEForFixedLengthVectorVT(VT)) 10609 return false; 10610 10611 if (VT.getVectorNumElements() == 4 && 10612 (VT.is128BitVector() || VT.is64BitVector())) { 10613 unsigned PFIndexes[4]; 10614 for (unsigned i = 0; i != 4; ++i) { 10615 if (M[i] < 0) 10616 PFIndexes[i] = 8; 10617 else 10618 PFIndexes[i] = M[i]; 10619 } 10620 10621 // Compute the index in the perfect shuffle table. 10622 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 10623 PFIndexes[2] * 9 + PFIndexes[3]; 10624 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 10625 unsigned Cost = (PFEntry >> 30); 10626 10627 if (Cost <= 4) 10628 return true; 10629 } 10630 10631 bool DummyBool; 10632 int DummyInt; 10633 unsigned DummyUnsigned; 10634 10635 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 10636 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 10637 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 10638 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 10639 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 10640 isZIPMask(M, VT, DummyUnsigned) || 10641 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 10642 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 10643 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 10644 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 10645 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 10646 } 10647 10648 /// getVShiftImm - Check if this is a valid build_vector for the immediate 10649 /// operand of a vector shift operation, where all the elements of the 10650 /// build_vector must have the same constant integer value. 10651 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 10652 // Ignore bit_converts. 10653 while (Op.getOpcode() == ISD::BITCAST) 10654 Op = Op.getOperand(0); 10655 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 10656 APInt SplatBits, SplatUndef; 10657 unsigned SplatBitSize; 10658 bool HasAnyUndefs; 10659 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, 10660 HasAnyUndefs, ElementBits) || 10661 SplatBitSize > ElementBits) 10662 return false; 10663 Cnt = SplatBits.getSExtValue(); 10664 return true; 10665 } 10666 10667 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 10668 /// operand of a vector shift left operation. That value must be in the range: 10669 /// 0 <= Value < ElementBits for a left shift; or 10670 /// 0 <= Value <= ElementBits for a long left shift. 
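/// For example (illustrative), with v8i16 elements a splatted shift amount of
/// 15 is accepted as an ordinary left-shift immediate, and 16 is additionally
/// accepted for the long-shift form.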
10671 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 10672 assert(VT.isVector() && "vector shift count is not a vector type"); 10673 int64_t ElementBits = VT.getScalarSizeInBits(); 10674 if (!getVShiftImm(Op, ElementBits, Cnt)) 10675 return false; 10676 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 10677 } 10678 10679 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 10680 /// operand of a vector shift right operation. The value must be in the range: 10681 /// 1 <= Value <= ElementBits for a right shift; or 10682 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { 10683 assert(VT.isVector() && "vector shift count is not a vector type"); 10684 int64_t ElementBits = VT.getScalarSizeInBits(); 10685 if (!getVShiftImm(Op, ElementBits, Cnt)) 10686 return false; 10687 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 10688 } 10689 10690 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, 10691 SelectionDAG &DAG) const { 10692 EVT VT = Op.getValueType(); 10693 10694 if (VT.getScalarType() == MVT::i1) { 10695 // Lower i1 truncate to `(x & 1) != 0`. 10696 SDLoc dl(Op); 10697 EVT OpVT = Op.getOperand(0).getValueType(); 10698 SDValue Zero = DAG.getConstant(0, dl, OpVT); 10699 SDValue One = DAG.getConstant(1, dl, OpVT); 10700 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); 10701 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); 10702 } 10703 10704 if (!VT.isVector() || VT.isScalableVector()) 10705 return SDValue(); 10706 10707 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) 10708 return LowerFixedLengthVectorTruncateToSVE(Op, DAG); 10709 10710 return SDValue(); 10711 } 10712 10713 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, 10714 SelectionDAG &DAG) const { 10715 EVT VT = Op.getValueType(); 10716 SDLoc DL(Op); 10717 int64_t Cnt; 10718 10719 if (!Op.getOperand(1).getValueType().isVector()) 10720 return Op; 10721 unsigned EltSize = VT.getScalarSizeInBits(); 10722 10723 switch (Op.getOpcode()) { 10724 default: 10725 llvm_unreachable("unexpected shift opcode"); 10726 10727 case ISD::SHL: 10728 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) 10729 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); 10730 10731 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) 10732 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), 10733 DAG.getConstant(Cnt, DL, MVT::i32)); 10734 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10735 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, 10736 MVT::i32), 10737 Op.getOperand(0), Op.getOperand(1)); 10738 case ISD::SRA: 10739 case ISD::SRL: 10740 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) { 10741 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED 10742 : AArch64ISD::SRL_PRED; 10743 return LowerToPredicatedOp(Op, DAG, Opc); 10744 } 10745 10746 // Right shift immediate 10747 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { 10748 unsigned Opc = 10749 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; 10750 return DAG.getNode(Opc, DL, VT, Op.getOperand(0), 10751 DAG.getConstant(Cnt, DL, MVT::i32)); 10752 } 10753 10754 // Right shift register. Note, there is not a shift right register 10755 // instruction, but the shift left register instruction takes a signed 10756 // value, where negative numbers specify a right shift. 10757 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
                       Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // negate the shift amount
    SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
                    NegShift);
    return NegShiftLeft;
  }

  return SDValue();
}

static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    const SDLoc &dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNOT(dl, Fcmeq, VT);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
10817 LLVM_FALLTHROUGH; 10818 case AArch64CC::MI: 10819 if (IsZero) 10820 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); 10821 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); 10822 } 10823 } 10824 10825 switch (CC) { 10826 default: 10827 return SDValue(); 10828 case AArch64CC::NE: { 10829 SDValue Cmeq; 10830 if (IsZero) 10831 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 10832 else 10833 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 10834 return DAG.getNOT(dl, Cmeq, VT); 10835 } 10836 case AArch64CC::EQ: 10837 if (IsZero) 10838 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 10839 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 10840 case AArch64CC::GE: 10841 if (IsZero) 10842 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); 10843 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); 10844 case AArch64CC::GT: 10845 if (IsZero) 10846 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); 10847 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); 10848 case AArch64CC::LE: 10849 if (IsZero) 10850 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); 10851 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); 10852 case AArch64CC::LS: 10853 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); 10854 case AArch64CC::LO: 10855 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); 10856 case AArch64CC::LT: 10857 if (IsZero) 10858 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); 10859 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); 10860 case AArch64CC::HI: 10861 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); 10862 case AArch64CC::HS: 10863 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); 10864 } 10865 } 10866 10867 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, 10868 SelectionDAG &DAG) const { 10869 if (Op.getValueType().isScalableVector()) 10870 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); 10871 10872 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) 10873 return LowerFixedLengthVectorSetccToSVE(Op, DAG); 10874 10875 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 10876 SDValue LHS = Op.getOperand(0); 10877 SDValue RHS = Op.getOperand(1); 10878 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); 10879 SDLoc dl(Op); 10880 10881 if (LHS.getValueType().getVectorElementType().isInteger()) { 10882 assert(LHS.getValueType() == RHS.getValueType()); 10883 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 10884 SDValue Cmp = 10885 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); 10886 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 10887 } 10888 10889 const bool FullFP16 = 10890 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 10891 10892 // Make v4f16 (only) fcmp operations utilise vector instructions 10893 // v8f16 support will be a litle more complicated 10894 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { 10895 if (LHS.getValueType().getVectorNumElements() == 4) { 10896 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); 10897 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); 10898 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); 10899 DAG.ReplaceAllUsesWith(Op, NewSetcc); 10900 CmpVT = MVT::v4i32; 10901 } else 10902 return SDValue(); 10903 } 10904 10905 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || 10906 LHS.getValueType().getVectorElementType() != MVT::f128); 10907 10908 // Unfortunately, the mapping of 
LLVM FP CC's onto AArch64 CC's isn't totally 10909 // clean. Some of them require two branches to implement. 10910 AArch64CC::CondCode CC1, CC2; 10911 bool ShouldInvert; 10912 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 10913 10914 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; 10915 SDValue Cmp = 10916 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 10917 if (!Cmp.getNode()) 10918 return SDValue(); 10919 10920 if (CC2 != AArch64CC::AL) { 10921 SDValue Cmp2 = 10922 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 10923 if (!Cmp2.getNode()) 10924 return SDValue(); 10925 10926 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 10927 } 10928 10929 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 10930 10931 if (ShouldInvert) 10932 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 10933 10934 return Cmp; 10935 } 10936 10937 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, 10938 SelectionDAG &DAG) { 10939 SDValue VecOp = ScalarOp.getOperand(0); 10940 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); 10941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, 10942 DAG.getConstant(0, DL, MVT::i64)); 10943 } 10944 10945 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, 10946 SelectionDAG &DAG) const { 10947 SDValue Src = Op.getOperand(0); 10948 10949 // Try to lower fixed length reductions to SVE. 10950 EVT SrcVT = Src.getValueType(); 10951 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND || 10952 Op.getOpcode() == ISD::VECREDUCE_OR || 10953 Op.getOpcode() == ISD::VECREDUCE_XOR || 10954 Op.getOpcode() == ISD::VECREDUCE_FADD || 10955 (Op.getOpcode() != ISD::VECREDUCE_ADD && 10956 SrcVT.getVectorElementType() == MVT::i64); 10957 if (SrcVT.isScalableVector() || 10958 useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { 10959 10960 if (SrcVT.getVectorElementType() == MVT::i1) 10961 return LowerPredReductionToSVE(Op, DAG); 10962 10963 switch (Op.getOpcode()) { 10964 case ISD::VECREDUCE_ADD: 10965 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); 10966 case ISD::VECREDUCE_AND: 10967 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); 10968 case ISD::VECREDUCE_OR: 10969 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); 10970 case ISD::VECREDUCE_SMAX: 10971 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); 10972 case ISD::VECREDUCE_SMIN: 10973 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); 10974 case ISD::VECREDUCE_UMAX: 10975 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); 10976 case ISD::VECREDUCE_UMIN: 10977 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); 10978 case ISD::VECREDUCE_XOR: 10979 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); 10980 case ISD::VECREDUCE_FADD: 10981 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); 10982 case ISD::VECREDUCE_FMAX: 10983 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); 10984 case ISD::VECREDUCE_FMIN: 10985 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); 10986 default: 10987 llvm_unreachable("Unhandled fixed length reduction"); 10988 } 10989 } 10990 10991 // Lower NEON reductions. 
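  // Illustrative sketch: (vecreduce_add v4i32:%v) becomes an AArch64ISD::UADDV
  // of %v followed by an extract_vector_elt of lane 0, which is what
  // getReductionSDNode above constructs.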
10992 SDLoc dl(Op); 10993 switch (Op.getOpcode()) { 10994 case ISD::VECREDUCE_ADD: 10995 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); 10996 case ISD::VECREDUCE_SMAX: 10997 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); 10998 case ISD::VECREDUCE_SMIN: 10999 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); 11000 case ISD::VECREDUCE_UMAX: 11001 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); 11002 case ISD::VECREDUCE_UMIN: 11003 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); 11004 case ISD::VECREDUCE_FMAX: { 11005 return DAG.getNode( 11006 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 11007 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), 11008 Src); 11009 } 11010 case ISD::VECREDUCE_FMIN: { 11011 return DAG.getNode( 11012 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 11013 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), 11014 Src); 11015 } 11016 default: 11017 llvm_unreachable("Unhandled reduction"); 11018 } 11019 } 11020 11021 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, 11022 SelectionDAG &DAG) const { 11023 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 11024 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) 11025 return SDValue(); 11026 11027 // LSE has an atomic load-add instruction, but not a load-sub. 11028 SDLoc dl(Op); 11029 MVT VT = Op.getSimpleValueType(); 11030 SDValue RHS = Op.getOperand(2); 11031 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 11032 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); 11033 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), 11034 Op.getOperand(0), Op.getOperand(1), RHS, 11035 AN->getMemOperand()); 11036 } 11037 11038 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, 11039 SelectionDAG &DAG) const { 11040 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 11041 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) 11042 return SDValue(); 11043 11044 // LSE has an atomic load-clear instruction, but not a load-and. 
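  // Sketch of the rewrite below: "atomicrmw and %ptr, %m" is re-expressed as
  // ATOMIC_LOAD_CLR with ~%m (computed as %m xor -1), since x AND m is
  // equivalent to clearing the bits of ~m; LSE's LDCLR* instructions then
  // implement the operation directly.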
11045 SDLoc dl(Op); 11046 MVT VT = Op.getSimpleValueType(); 11047 SDValue RHS = Op.getOperand(2); 11048 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 11049 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); 11050 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), 11051 Op.getOperand(0), Op.getOperand(1), RHS, 11052 AN->getMemOperand()); 11053 } 11054 11055 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( 11056 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { 11057 SDLoc dl(Op); 11058 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 11059 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); 11060 11061 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 11062 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); 11063 if (Subtarget->hasCustomCallingConv()) 11064 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 11065 11066 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, 11067 DAG.getConstant(4, dl, MVT::i64)); 11068 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); 11069 Chain = 11070 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), 11071 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), 11072 DAG.getRegisterMask(Mask), Chain.getValue(1)); 11073 // To match the actual intent better, we should read the output from X15 here 11074 // again (instead of potentially spilling it to the stack), but rereading Size 11075 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined 11076 // here. 11077 11078 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, 11079 DAG.getConstant(4, dl, MVT::i64)); 11080 return Chain; 11081 } 11082 11083 SDValue 11084 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 11085 SelectionDAG &DAG) const { 11086 assert(Subtarget->isTargetWindows() && 11087 "Only Windows alloca probing supported"); 11088 SDLoc dl(Op); 11089 // Get the inputs. 
11090 SDNode *Node = Op.getNode(); 11091 SDValue Chain = Op.getOperand(0); 11092 SDValue Size = Op.getOperand(1); 11093 MaybeAlign Align = 11094 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 11095 EVT VT = Node->getValueType(0); 11096 11097 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 11098 "no-stack-arg-probe")) { 11099 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 11100 Chain = SP.getValue(1); 11101 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 11102 if (Align) 11103 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 11104 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 11105 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 11106 SDValue Ops[2] = {SP, Chain}; 11107 return DAG.getMergeValues(Ops, dl); 11108 } 11109 11110 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 11111 11112 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); 11113 11114 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 11115 Chain = SP.getValue(1); 11116 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 11117 if (Align) 11118 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 11119 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 11120 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 11121 11122 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 11123 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); 11124 11125 SDValue Ops[2] = {SP, Chain}; 11126 return DAG.getMergeValues(Ops, dl); 11127 } 11128 11129 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, 11130 SelectionDAG &DAG) const { 11131 EVT VT = Op.getValueType(); 11132 assert(VT != MVT::i64 && "Expected illegal VSCALE node"); 11133 11134 SDLoc DL(Op); 11135 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); 11136 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), 11137 DL, VT); 11138 } 11139 11140 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. 11141 template <unsigned NumVecs> 11142 static bool 11143 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, 11144 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { 11145 Info.opc = ISD::INTRINSIC_VOID; 11146 // Retrieve EC from first vector argument. 11147 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); 11148 ElementCount EC = VT.getVectorElementCount(); 11149 #ifndef NDEBUG 11150 // Check the assumption that all input vectors are the same type. 11151 for (unsigned I = 0; I < NumVecs; ++I) 11152 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && 11153 "Invalid type."); 11154 #endif 11155 // memVT is `NumVecs * VT`. 11156 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), 11157 EC * NumVecs); 11158 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1); 11159 Info.offset = 0; 11160 Info.align.reset(); 11161 Info.flags = MachineMemOperand::MOStore; 11162 return true; 11163 } 11164 11165 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 11166 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 11167 /// specified in the intrinsic calls. 
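/// For example (illustrative), an @llvm.aarch64.ldxr call on an i64 pointer is
/// described below as a volatile i64 load from its pointer argument
/// (MOLoad | MOVolatile), while the structured NEON loads get a conservative
/// memVT covering every vector they touch.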
11168 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 11169 const CallInst &I, 11170 MachineFunction &MF, 11171 unsigned Intrinsic) const { 11172 auto &DL = I.getModule()->getDataLayout(); 11173 switch (Intrinsic) { 11174 case Intrinsic::aarch64_sve_st2: 11175 return setInfoSVEStN<2>(*this, DL, Info, I); 11176 case Intrinsic::aarch64_sve_st3: 11177 return setInfoSVEStN<3>(*this, DL, Info, I); 11178 case Intrinsic::aarch64_sve_st4: 11179 return setInfoSVEStN<4>(*this, DL, Info, I); 11180 case Intrinsic::aarch64_neon_ld2: 11181 case Intrinsic::aarch64_neon_ld3: 11182 case Intrinsic::aarch64_neon_ld4: 11183 case Intrinsic::aarch64_neon_ld1x2: 11184 case Intrinsic::aarch64_neon_ld1x3: 11185 case Intrinsic::aarch64_neon_ld1x4: 11186 case Intrinsic::aarch64_neon_ld2lane: 11187 case Intrinsic::aarch64_neon_ld3lane: 11188 case Intrinsic::aarch64_neon_ld4lane: 11189 case Intrinsic::aarch64_neon_ld2r: 11190 case Intrinsic::aarch64_neon_ld3r: 11191 case Intrinsic::aarch64_neon_ld4r: { 11192 Info.opc = ISD::INTRINSIC_W_CHAIN; 11193 // Conservatively set memVT to the entire set of vectors loaded. 11194 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 11195 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 11196 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 11197 Info.offset = 0; 11198 Info.align.reset(); 11199 // volatile loads with NEON intrinsics not supported 11200 Info.flags = MachineMemOperand::MOLoad; 11201 return true; 11202 } 11203 case Intrinsic::aarch64_neon_st2: 11204 case Intrinsic::aarch64_neon_st3: 11205 case Intrinsic::aarch64_neon_st4: 11206 case Intrinsic::aarch64_neon_st1x2: 11207 case Intrinsic::aarch64_neon_st1x3: 11208 case Intrinsic::aarch64_neon_st1x4: 11209 case Intrinsic::aarch64_neon_st2lane: 11210 case Intrinsic::aarch64_neon_st3lane: 11211 case Intrinsic::aarch64_neon_st4lane: { 11212 Info.opc = ISD::INTRINSIC_VOID; 11213 // Conservatively set memVT to the entire set of vectors stored. 
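    // E.g. (sketch): for an st3 of three v4i32 values the loop below sums
    // 3 * (128 / 64) = 6, so memVT ends up as v6i64.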
11214 unsigned NumElts = 0; 11215 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 11216 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 11217 if (!ArgTy->isVectorTy()) 11218 break; 11219 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 11220 } 11221 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 11222 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 11223 Info.offset = 0; 11224 Info.align.reset(); 11225 // volatile stores with NEON intrinsics not supported 11226 Info.flags = MachineMemOperand::MOStore; 11227 return true; 11228 } 11229 case Intrinsic::aarch64_ldaxr: 11230 case Intrinsic::aarch64_ldxr: { 11231 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 11232 Info.opc = ISD::INTRINSIC_W_CHAIN; 11233 Info.memVT = MVT::getVT(PtrTy->getElementType()); 11234 Info.ptrVal = I.getArgOperand(0); 11235 Info.offset = 0; 11236 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 11237 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 11238 return true; 11239 } 11240 case Intrinsic::aarch64_stlxr: 11241 case Intrinsic::aarch64_stxr: { 11242 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 11243 Info.opc = ISD::INTRINSIC_W_CHAIN; 11244 Info.memVT = MVT::getVT(PtrTy->getElementType()); 11245 Info.ptrVal = I.getArgOperand(1); 11246 Info.offset = 0; 11247 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 11248 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 11249 return true; 11250 } 11251 case Intrinsic::aarch64_ldaxp: 11252 case Intrinsic::aarch64_ldxp: 11253 Info.opc = ISD::INTRINSIC_W_CHAIN; 11254 Info.memVT = MVT::i128; 11255 Info.ptrVal = I.getArgOperand(0); 11256 Info.offset = 0; 11257 Info.align = Align(16); 11258 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 11259 return true; 11260 case Intrinsic::aarch64_stlxp: 11261 case Intrinsic::aarch64_stxp: 11262 Info.opc = ISD::INTRINSIC_W_CHAIN; 11263 Info.memVT = MVT::i128; 11264 Info.ptrVal = I.getArgOperand(2); 11265 Info.offset = 0; 11266 Info.align = Align(16); 11267 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 11268 return true; 11269 case Intrinsic::aarch64_sve_ldnt1: { 11270 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 11271 Info.opc = ISD::INTRINSIC_W_CHAIN; 11272 Info.memVT = MVT::getVT(I.getType()); 11273 Info.ptrVal = I.getArgOperand(1); 11274 Info.offset = 0; 11275 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 11276 Info.flags = MachineMemOperand::MOLoad; 11277 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) 11278 Info.flags |= MachineMemOperand::MONonTemporal; 11279 return true; 11280 } 11281 case Intrinsic::aarch64_sve_stnt1: { 11282 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); 11283 Info.opc = ISD::INTRINSIC_W_CHAIN; 11284 Info.memVT = MVT::getVT(I.getOperand(0)->getType()); 11285 Info.ptrVal = I.getArgOperand(2); 11286 Info.offset = 0; 11287 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 11288 Info.flags = MachineMemOperand::MOStore; 11289 if (Intrinsic == Intrinsic::aarch64_sve_stnt1) 11290 Info.flags |= MachineMemOperand::MONonTemporal; 11291 return true; 11292 } 11293 default: 11294 break; 11295 } 11296 11297 return false; 11298 } 11299 11300 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, 11301 ISD::LoadExtType ExtTy, 11302 EVT NewVT) const { 11303 // TODO: This may be worth removing. Check regression tests for diffs. 
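  // Illustrative example of the case rejected further down: for an i64 load of
  // the form (load (add %base, (shl %idx, 3))), the shift matches the scaled
  // addressing mode (e.g. "ldr x0, [x1, x2, lsl #3]"); narrowing the load
  // would break that fold, so the width reduction is refused.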
11304 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) 11305 return false; 11306 11307 // If we're reducing the load width in order to avoid having to use an extra 11308 // instruction to do extension then it's probably a good idea. 11309 if (ExtTy != ISD::NON_EXTLOAD) 11310 return true; 11311 // Don't reduce load width if it would prevent us from combining a shift into 11312 // the offset. 11313 MemSDNode *Mem = dyn_cast<MemSDNode>(Load); 11314 assert(Mem); 11315 const SDValue &Base = Mem->getBasePtr(); 11316 if (Base.getOpcode() == ISD::ADD && 11317 Base.getOperand(1).getOpcode() == ISD::SHL && 11318 Base.getOperand(1).hasOneUse() && 11319 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { 11320 // The shift can be combined if it matches the size of the value being 11321 // loaded (and so reducing the width would make it not match). 11322 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); 11323 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; 11324 if (ShiftAmount == Log2_32(LoadBytes)) 11325 return false; 11326 } 11327 // We have no reason to disallow reducing the load width, so allow it. 11328 return true; 11329 } 11330 11331 // Truncations from 64-bit GPR to 32-bit GPR is free. 11332 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 11333 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11334 return false; 11335 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize(); 11336 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize(); 11337 return NumBits1 > NumBits2; 11338 } 11339 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 11340 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 11341 return false; 11342 uint64_t NumBits1 = VT1.getFixedSizeInBits(); 11343 uint64_t NumBits2 = VT2.getFixedSizeInBits(); 11344 return NumBits1 > NumBits2; 11345 } 11346 11347 /// Check if it is profitable to hoist instruction in then/else to if. 11348 /// Not profitable if I and it's user can form a FMA instruction 11349 /// because we prefer FMSUB/FMADD. 11350 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { 11351 if (I->getOpcode() != Instruction::FMul) 11352 return true; 11353 11354 if (!I->hasOneUse()) 11355 return true; 11356 11357 Instruction *User = I->user_back(); 11358 11359 if (User && 11360 !(User->getOpcode() == Instruction::FSub || 11361 User->getOpcode() == Instruction::FAdd)) 11362 return true; 11363 11364 const TargetOptions &Options = getTargetMachine().Options; 11365 const Function *F = I->getFunction(); 11366 const DataLayout &DL = F->getParent()->getDataLayout(); 11367 Type *Ty = User->getOperand(0)->getType(); 11368 11369 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && 11370 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && 11371 (Options.AllowFPOpFusion == FPOpFusion::Fast || 11372 Options.UnsafeFPMath)); 11373 } 11374 11375 // All 32-bit GPR operations implicitly zero the high-half of the corresponding 11376 // 64-bit GPR. 
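// For example (sketch), after "add w8, w9, w10" bits [63:32] of x8 are already
// zero, so a subsequent zext i32 -> i64 of that result costs no extra
// instruction.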
11377 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 11378 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11379 return false; 11380 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 11381 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 11382 return NumBits1 == 32 && NumBits2 == 64; 11383 } 11384 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 11385 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 11386 return false; 11387 unsigned NumBits1 = VT1.getSizeInBits(); 11388 unsigned NumBits2 = VT2.getSizeInBits(); 11389 return NumBits1 == 32 && NumBits2 == 64; 11390 } 11391 11392 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 11393 EVT VT1 = Val.getValueType(); 11394 if (isZExtFree(VT1, VT2)) { 11395 return true; 11396 } 11397 11398 if (Val.getOpcode() != ISD::LOAD) 11399 return false; 11400 11401 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. 11402 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && 11403 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && 11404 VT1.getSizeInBits() <= 32); 11405 } 11406 11407 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { 11408 if (isa<FPExtInst>(Ext)) 11409 return false; 11410 11411 // Vector types are not free. 11412 if (Ext->getType()->isVectorTy()) 11413 return false; 11414 11415 for (const Use &U : Ext->uses()) { 11416 // The extension is free if we can fold it with a left shift in an 11417 // addressing mode or an arithmetic operation: add, sub, and cmp. 11418 11419 // Is there a shift? 11420 const Instruction *Instr = cast<Instruction>(U.getUser()); 11421 11422 // Is this a constant shift? 11423 switch (Instr->getOpcode()) { 11424 case Instruction::Shl: 11425 if (!isa<ConstantInt>(Instr->getOperand(1))) 11426 return false; 11427 break; 11428 case Instruction::GetElementPtr: { 11429 gep_type_iterator GTI = gep_type_begin(Instr); 11430 auto &DL = Ext->getModule()->getDataLayout(); 11431 std::advance(GTI, U.getOperandNo()-1); 11432 Type *IdxTy = GTI.getIndexedType(); 11433 // This extension will end up with a shift because of the scaling factor. 11434 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. 11435 // Get the shift amount based on the scaling factor: 11436 // log2(sizeof(IdxTy)) - log2(8). 11437 uint64_t ShiftAmt = 11438 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; 11439 // Is the constant foldable in the shift of the addressing mode? 11440 // I.e., shift amount is between 1 and 4 inclusive. 11441 if (ShiftAmt == 0 || ShiftAmt > 4) 11442 return false; 11443 break; 11444 } 11445 case Instruction::Trunc: 11446 // Check if this is a noop. 11447 // trunc(sext ty1 to ty2) to ty1. 11448 if (Instr->getType() == Ext->getOperand(0)->getType()) 11449 continue; 11450 LLVM_FALLTHROUGH; 11451 default: 11452 return false; 11453 } 11454 11455 // At this point we can use the bfm family, so this extension is free 11456 // for that use. 11457 } 11458 return true; 11459 } 11460 11461 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower 11462 /// or upper half of the vector elements. 
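/// E.g. (illustrative IR), both operands extracting the same low half:
///   %e1 = shufflevector <8 x i16> %a, <8 x i16> undef,
///                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %e2 = shufflevector <8 x i16> %b, <8 x i16> undef,
///                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>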
11463 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { 11464 auto areTypesHalfed = [](Value *FullV, Value *HalfV) { 11465 auto *FullTy = FullV->getType(); 11466 auto *HalfTy = HalfV->getType(); 11467 return FullTy->getPrimitiveSizeInBits().getFixedSize() == 11468 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); 11469 }; 11470 11471 auto extractHalf = [](Value *FullV, Value *HalfV) { 11472 auto *FullVT = cast<FixedVectorType>(FullV->getType()); 11473 auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); 11474 return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); 11475 }; 11476 11477 ArrayRef<int> M1, M2; 11478 Value *S1Op1, *S2Op1; 11479 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || 11480 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) 11481 return false; 11482 11483 // Check that the operands are half as wide as the result and we extract 11484 // half of the elements of the input vectors. 11485 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || 11486 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) 11487 return false; 11488 11489 // Check the mask extracts either the lower or upper half of vector 11490 // elements. 11491 int M1Start = -1; 11492 int M2Start = -1; 11493 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; 11494 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || 11495 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || 11496 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) 11497 return false; 11498 11499 return true; 11500 } 11501 11502 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 11503 /// of the vector elements. 11504 static bool areExtractExts(Value *Ext1, Value *Ext2) { 11505 auto areExtDoubled = [](Instruction *Ext) { 11506 return Ext->getType()->getScalarSizeInBits() == 11507 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 11508 }; 11509 11510 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 11511 !match(Ext2, m_ZExtOrSExt(m_Value())) || 11512 !areExtDoubled(cast<Instruction>(Ext1)) || 11513 !areExtDoubled(cast<Instruction>(Ext2))) 11514 return false; 11515 11516 return true; 11517 } 11518 11519 /// Check if Op could be used with vmull_high_p64 intrinsic. 11520 static bool isOperandOfVmullHighP64(Value *Op) { 11521 Value *VectorOperand = nullptr; 11522 ConstantInt *ElementIndex = nullptr; 11523 return match(Op, m_ExtractElt(m_Value(VectorOperand), 11524 m_ConstantInt(ElementIndex))) && 11525 ElementIndex->getValue() == 1 && 11526 isa<FixedVectorType>(VectorOperand->getType()) && 11527 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; 11528 } 11529 11530 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. 11531 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { 11532 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); 11533 } 11534 11535 /// Check if sinking \p I's operands to I's basic block is profitable, because 11536 /// the operands can be folded into a target instruction, e.g. 11537 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
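/// A hedged sketch: if both operands of an add are sexts of <4 x i16> values
/// defined in another block, sinking the sexts next to the add lets ISel fold
/// the whole thing into a single saddl-style widening add instead of
/// materialising the extended vectors separately.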
11538 bool AArch64TargetLowering::shouldSinkOperands( 11539 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 11540 if (!I->getType()->isVectorTy()) 11541 return false; 11542 11543 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 11544 switch (II->getIntrinsicID()) { 11545 case Intrinsic::aarch64_neon_umull: 11546 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) 11547 return false; 11548 Ops.push_back(&II->getOperandUse(0)); 11549 Ops.push_back(&II->getOperandUse(1)); 11550 return true; 11551 11552 case Intrinsic::aarch64_neon_pmull64: 11553 if (!areOperandsOfVmullHighP64(II->getArgOperand(0), 11554 II->getArgOperand(1))) 11555 return false; 11556 Ops.push_back(&II->getArgOperandUse(0)); 11557 Ops.push_back(&II->getArgOperandUse(1)); 11558 return true; 11559 11560 default: 11561 return false; 11562 } 11563 } 11564 11565 switch (I->getOpcode()) { 11566 case Instruction::Sub: 11567 case Instruction::Add: { 11568 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 11569 return false; 11570 11571 // If the exts' operands extract either the lower or upper elements, we 11572 // can sink them too. 11573 auto Ext1 = cast<Instruction>(I->getOperand(0)); 11574 auto Ext2 = cast<Instruction>(I->getOperand(1)); 11575 if (areExtractShuffleVectors(Ext1, Ext2)) { 11576 Ops.push_back(&Ext1->getOperandUse(0)); 11577 Ops.push_back(&Ext2->getOperandUse(0)); 11578 } 11579 11580 Ops.push_back(&I->getOperandUse(0)); 11581 Ops.push_back(&I->getOperandUse(1)); 11582 11583 return true; 11584 } 11585 case Instruction::Mul: { 11586 bool IsProfitable = false; 11587 for (auto &Op : I->operands()) { 11588 // Make sure we are not already sinking this operand 11589 if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) 11590 continue; 11591 11592 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); 11593 if (!Shuffle || !Shuffle->isZeroEltSplat()) 11594 continue; 11595 11596 Value *ShuffleOperand = Shuffle->getOperand(0); 11597 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); 11598 if (!Insert) 11599 continue; 11600 11601 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); 11602 if (!OperandInstr) 11603 continue; 11604 11605 ConstantInt *ElementConstant = 11606 dyn_cast<ConstantInt>(Insert->getOperand(2)); 11607 // Check that the insertelement is inserting into element 0 11608 if (!ElementConstant || ElementConstant->getZExtValue() != 0) 11609 continue; 11610 11611 unsigned Opcode = OperandInstr->getOpcode(); 11612 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt) 11613 continue; 11614 11615 Ops.push_back(&Shuffle->getOperandUse(0)); 11616 Ops.push_back(&Op); 11617 IsProfitable = true; 11618 } 11619 11620 return IsProfitable; 11621 } 11622 default: 11623 return false; 11624 } 11625 return false; 11626 } 11627 11628 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 11629 Align &RequiredAligment) const { 11630 if (!LoadedType.isSimple() || 11631 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 11632 return false; 11633 // Cyclone supports unaligned accesses. 11634 RequiredAligment = Align(1); 11635 unsigned NumBits = LoadedType.getSizeInBits(); 11636 return NumBits == 32 || NumBits == 64; 11637 } 11638 11639 /// A helper function for determining the number of interleaved accesses we 11640 /// will generate when lowering accesses of the given type. 
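/// For example (illustrative), a <16 x i32> access (512 bits) maps to
/// (512 + 127) / 128 = 4 interleaved accesses, while a 64-bit <2 x i32>
/// access needs only one.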
11641 unsigned 11642 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 11643 const DataLayout &DL) const { 11644 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 11645 } 11646 11647 MachineMemOperand::Flags 11648 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { 11649 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && 11650 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) 11651 return MOStridedAccess; 11652 return MachineMemOperand::MONone; 11653 } 11654 11655 bool AArch64TargetLowering::isLegalInterleavedAccessType( 11656 VectorType *VecTy, const DataLayout &DL) const { 11657 11658 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 11659 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 11660 11661 // Ensure the number of vector elements is greater than 1. 11662 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2) 11663 return false; 11664 11665 // Ensure the element type is legal. 11666 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) 11667 return false; 11668 11669 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 11670 // 128 will be split into multiple interleaved accesses. 11671 return VecSize == 64 || VecSize % 128 == 0; 11672 } 11673 11674 /// Lower an interleaved load into a ldN intrinsic. 11675 /// 11676 /// E.g. Lower an interleaved load (Factor = 2): 11677 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 11678 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 11679 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 11680 /// 11681 /// Into: 11682 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 11683 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 11684 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 11685 bool AArch64TargetLowering::lowerInterleavedLoad( 11686 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 11687 ArrayRef<unsigned> Indices, unsigned Factor) const { 11688 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 11689 "Invalid interleave factor"); 11690 assert(!Shuffles.empty() && "Empty shufflevector input"); 11691 assert(Shuffles.size() == Indices.size() && 11692 "Unmatched number of shufflevectors and indices"); 11693 11694 const DataLayout &DL = LI->getModule()->getDataLayout(); 11695 11696 VectorType *VTy = Shuffles[0]->getType(); 11697 11698 // Skip if we do not have NEON and skip illegal vector types. We can 11699 // "legalize" wide vector types into multiple interleaved accesses as long as 11700 // the vector types are divisible by 128. 11701 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL)) 11702 return false; 11703 11704 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL); 11705 11706 auto *FVTy = cast<FixedVectorType>(VTy); 11707 11708 // A pointer vector can not be the return type of the ldN intrinsics. Need to 11709 // load integer vectors first and then convert to pointer vectors. 11710 Type *EltTy = FVTy->getElementType(); 11711 if (EltTy->isPointerTy()) 11712 FVTy = 11713 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); 11714 11715 IRBuilder<> Builder(LI); 11716 11717 // The base address of the load. 11718 Value *BaseAddr = LI->getPointerOperand(); 11719 11720 if (NumLoads > 1) { 11721 // If we're going to generate more than one load, reset the sub-vector type 11722 // to something legal. 
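    // E.g. (sketch): for a factor-2 interleaved load producing <16 x i32>
    // shuffles, NumLoads is 4 and FVTy is reset to <4 x i32>, so four ld2
    // calls are generated, one per sub-vector pair.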
11723 FVTy = FixedVectorType::get(FVTy->getElementType(), 11724 FVTy->getNumElements() / NumLoads); 11725 11726 // We will compute the pointer operand of each load from the original base 11727 // address using GEPs. Cast the base address to a pointer to the scalar 11728 // element type. 11729 BaseAddr = Builder.CreateBitCast( 11730 BaseAddr, 11731 FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 11732 } 11733 11734 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace()); 11735 Type *Tys[2] = {FVTy, PtrTy}; 11736 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, 11737 Intrinsic::aarch64_neon_ld3, 11738 Intrinsic::aarch64_neon_ld4}; 11739 Function *LdNFunc = 11740 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 11741 11742 // Holds sub-vectors extracted from the load intrinsic return values. The 11743 // sub-vectors are associated with the shufflevector instructions they will 11744 // replace. 11745 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 11746 11747 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 11748 11749 // If we're generating more than one load, compute the base address of 11750 // subsequent loads as an offset from the previous. 11751 if (LoadCount > 0) 11752 BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr, 11753 FVTy->getNumElements() * Factor); 11754 11755 CallInst *LdN = Builder.CreateCall( 11756 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); 11757 11758 // Extract and store the sub-vectors returned by the load intrinsic. 11759 for (unsigned i = 0; i < Shuffles.size(); i++) { 11760 ShuffleVectorInst *SVI = Shuffles[i]; 11761 unsigned Index = Indices[i]; 11762 11763 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 11764 11765 // Convert the integer vector to pointer vector if the element is pointer. 11766 if (EltTy->isPointerTy()) 11767 SubVec = Builder.CreateIntToPtr( 11768 SubVec, FixedVectorType::get(SVI->getType()->getElementType(), 11769 FVTy->getNumElements())); 11770 SubVecs[SVI].push_back(SubVec); 11771 } 11772 } 11773 11774 // Replace uses of the shufflevector instructions with the sub-vectors 11775 // returned by the load intrinsic. If a shufflevector instruction is 11776 // associated with more than one sub-vector, those sub-vectors will be 11777 // concatenated into a single wide vector. 11778 for (ShuffleVectorInst *SVI : Shuffles) { 11779 auto &SubVec = SubVecs[SVI]; 11780 auto *WideVec = 11781 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 11782 SVI->replaceAllUsesWith(WideVec); 11783 } 11784 11785 return true; 11786 } 11787 11788 /// Lower an interleaved store into a stN intrinsic. 11789 /// 11790 /// E.g. Lower an interleaved store (Factor = 3): 11791 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 11792 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 11793 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 11794 /// 11795 /// Into: 11796 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 11797 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 11798 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 11799 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 11800 /// 11801 /// Note that the new shufflevectors will be removed and we'll only generate one 11802 /// st3 instruction in CodeGen. 11803 /// 11804 /// Example for a more general valid mask (Factor 3). 
Lower: 11805 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 11806 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 11807 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 11808 /// 11809 /// Into: 11810 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 11811 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 11812 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 11813 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 11814 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 11815 ShuffleVectorInst *SVI, 11816 unsigned Factor) const { 11817 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 11818 "Invalid interleave factor"); 11819 11820 auto *VecTy = cast<FixedVectorType>(SVI->getType()); 11821 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); 11822 11823 unsigned LaneLen = VecTy->getNumElements() / Factor; 11824 Type *EltTy = VecTy->getElementType(); 11825 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); 11826 11827 const DataLayout &DL = SI->getModule()->getDataLayout(); 11828 11829 // Skip if we do not have NEON and skip illegal vector types. We can 11830 // "legalize" wide vector types into multiple interleaved accesses as long as 11831 // the vector types are divisible by 128. 11832 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) 11833 return false; 11834 11835 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 11836 11837 Value *Op0 = SVI->getOperand(0); 11838 Value *Op1 = SVI->getOperand(1); 11839 IRBuilder<> Builder(SI); 11840 11841 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 11842 // vectors to integer vectors. 11843 if (EltTy->isPointerTy()) { 11844 Type *IntTy = DL.getIntPtrType(EltTy); 11845 unsigned NumOpElts = 11846 cast<FixedVectorType>(Op0->getType())->getNumElements(); 11847 11848 // Convert to the corresponding integer vector. 11849 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); 11850 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 11851 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 11852 11853 SubVecTy = FixedVectorType::get(IntTy, LaneLen); 11854 } 11855 11856 // The base address of the store. 11857 Value *BaseAddr = SI->getPointerOperand(); 11858 11859 if (NumStores > 1) { 11860 // If we're going to generate more than one store, reset the lane length 11861 // and sub-vector type to something legal. 11862 LaneLen /= NumStores; 11863 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); 11864 11865 // We will compute the pointer operand of each store from the original base 11866 // address using GEPs. Cast the base address to a pointer to the scalar 11867 // element type. 11868 BaseAddr = Builder.CreateBitCast( 11869 BaseAddr, 11870 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); 11871 } 11872 11873 auto Mask = SVI->getShuffleMask(); 11874 11875 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); 11876 Type *Tys[2] = {SubVecTy, PtrTy}; 11877 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, 11878 Intrinsic::aarch64_neon_st3, 11879 Intrinsic::aarch64_neon_st4}; 11880 Function *StNFunc = 11881 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 11882 11883 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 11884 11885 SmallVector<Value *, 5> Ops; 11886 11887 // Split the shufflevector operands into sub vectors for the new stN call. 
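    // As an illustration of the loop below (values taken from the Factor = 3
    // example in the function comment): with Mask = <4, 32, 16, 5, 33, 17, ...>
    // and LaneLen = 4, element i of the first group supplies the start index,
    // so the sub-vectors are built with the sequential masks <4,5,6,7>,
    // <32,33,34,35> and <16,17,18,19>, i.e. %sub.v0, %sub.v1 and %sub.v2.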
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
    Builder.CreateCall(StNFunc, Ops);
  }
  return true;
}

// Lower an SVE structured load intrinsic returning a tuple type to a
// target-specific intrinsic taking the same input but returning a multi-result
// value of the split tuple type.
//
// E.g. Lowering an LD3:
//
//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
//                                                    <vscale x 4 x i1> %pred,
//                                                    <vscale x 4 x i32>* %addr)
//
//  Output DAG:
//
//    t0: ch = EntryToken
//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
//    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
//
// This is called pre-legalization to avoid widening/splitting issues with
// non-power-of-2 tuple types used for LD3, such as nxv12i32.
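//
// In the example above N = 3, so the nxv12i32 result is rebuilt by
// concatenating the three legal nxv4i32 values produced by the SVE_LD3 node.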
11944 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, 11945 ArrayRef<SDValue> LoadOps, 11946 EVT VT, SelectionDAG &DAG, 11947 const SDLoc &DL) const { 11948 assert(VT.isScalableVector() && "Can only lower scalable vectors"); 11949 11950 unsigned N, Opcode; 11951 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = { 11952 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, 11953 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, 11954 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; 11955 11956 std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; 11957 assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && 11958 "invalid tuple vector type!"); 11959 11960 EVT SplitVT = 11961 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 11962 VT.getVectorElementCount().divideCoefficientBy(N)); 11963 assert(isTypeLegal(SplitVT)); 11964 11965 SmallVector<EVT, 5> VTs(N, SplitVT); 11966 VTs.push_back(MVT::Other); // Chain 11967 SDVTList NodeTys = DAG.getVTList(VTs); 11968 11969 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps); 11970 SmallVector<SDValue, 4> PseudoLoadOps; 11971 for (unsigned I = 0; I < N; ++I) 11972 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I)); 11973 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps); 11974 } 11975 11976 EVT AArch64TargetLowering::getOptimalMemOpType( 11977 const MemOp &Op, const AttributeList &FuncAttributes) const { 11978 bool CanImplicitFloat = 11979 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); 11980 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 11981 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 11982 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 11983 // taken one instruction to materialize the v2i64 zero and one store (with 11984 // restrictive addressing mode). Just do i64 stores. 11985 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 11986 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 11987 if (Op.isAligned(AlignCheck)) 11988 return true; 11989 bool Fast; 11990 return allowsMisalignedMemoryAccesses(VT, 0, Align(1), 11991 MachineMemOperand::MONone, &Fast) && 11992 Fast; 11993 }; 11994 11995 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 11996 AlignmentIsAcceptable(MVT::v2i64, Align(16))) 11997 return MVT::v2i64; 11998 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 11999 return MVT::f128; 12000 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 12001 return MVT::i64; 12002 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 12003 return MVT::i32; 12004 return MVT::Other; 12005 } 12006 12007 LLT AArch64TargetLowering::getOptimalMemOpLLT( 12008 const MemOp &Op, const AttributeList &FuncAttributes) const { 12009 bool CanImplicitFloat = 12010 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); 12011 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 12012 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 12013 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 12014 // taken one instruction to materialize the v2i64 zero and one store (with 12015 // restrictive addressing mode). Just do i64 stores. 
12016 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 12017 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 12018 if (Op.isAligned(AlignCheck)) 12019 return true; 12020 bool Fast; 12021 return allowsMisalignedMemoryAccesses(VT, 0, Align(1), 12022 MachineMemOperand::MONone, &Fast) && 12023 Fast; 12024 }; 12025 12026 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 12027 AlignmentIsAcceptable(MVT::v2i64, Align(16))) 12028 return LLT::fixed_vector(2, 64); 12029 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 12030 return LLT::scalar(128); 12031 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 12032 return LLT::scalar(64); 12033 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 12034 return LLT::scalar(32); 12035 return LLT(); 12036 } 12037 12038 // 12-bit optionally shifted immediates are legal for adds. 12039 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { 12040 if (Immed == std::numeric_limits<int64_t>::min()) { 12041 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed 12042 << ": avoid UB for INT64_MIN\n"); 12043 return false; 12044 } 12045 // Same encoding for add/sub, just flip the sign. 12046 Immed = std::abs(Immed); 12047 bool IsLegal = ((Immed >> 12) == 0 || 12048 ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); 12049 LLVM_DEBUG(dbgs() << "Is " << Immed 12050 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); 12051 return IsLegal; 12052 } 12053 12054 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 12055 // immediates is the same as for an add or a sub. 12056 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 12057 return isLegalAddImmediate(Immed); 12058 } 12059 12060 /// isLegalAddressingMode - Return true if the addressing mode represented 12061 /// by AM is legal for this target, for a load/store of the specified type. 12062 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 12063 const AddrMode &AM, Type *Ty, 12064 unsigned AS, Instruction *I) const { 12065 // AArch64 has five basic addressing modes: 12066 // reg 12067 // reg + 9-bit signed offset 12068 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 12069 // reg1 + reg2 12070 // reg + SIZE_IN_BYTES * reg 12071 12072 // No global is ever allowed as a base. 12073 if (AM.BaseGV) 12074 return false; 12075 12076 // No reg+reg+imm addressing. 12077 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 12078 return false; 12079 12080 // FIXME: Update this method to support scalable addressing modes. 
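  // For now a scalable-vector access is only treated as legal when it is
  // base-register only, or base plus a register scaled by the element size
  // (e.g. [x0, x1, lsl #3] for nxv2i64); immediate offsets are rejected.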
12081 if (isa<ScalableVectorType>(Ty)) { 12082 uint64_t VecElemNumBytes = 12083 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8; 12084 return AM.HasBaseReg && !AM.BaseOffs && 12085 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); 12086 } 12087 12088 // check reg + imm case: 12089 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 12090 uint64_t NumBytes = 0; 12091 if (Ty->isSized()) { 12092 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 12093 NumBytes = NumBits / 8; 12094 if (!isPowerOf2_64(NumBits)) 12095 NumBytes = 0; 12096 } 12097 12098 if (!AM.Scale) { 12099 int64_t Offset = AM.BaseOffs; 12100 12101 // 9-bit signed offset 12102 if (isInt<9>(Offset)) 12103 return true; 12104 12105 // 12-bit unsigned offset 12106 unsigned shift = Log2_64(NumBytes); 12107 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 12108 // Must be a multiple of NumBytes (NumBytes is a power of 2) 12109 (Offset >> shift) << shift == Offset) 12110 return true; 12111 return false; 12112 } 12113 12114 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 12115 12116 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); 12117 } 12118 12119 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { 12120 // Consider splitting large offset of struct or array. 12121 return true; 12122 } 12123 12124 InstructionCost AArch64TargetLowering::getScalingFactorCost( 12125 const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { 12126 // Scaling factors are not free at all. 12127 // Operands | Rt Latency 12128 // ------------------------------------------- 12129 // Rt, [Xn, Xm] | 4 12130 // ------------------------------------------- 12131 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 12132 // Rt, [Xn, Wm, <extend> #imm] | 12133 if (isLegalAddressingMode(DL, AM, Ty, AS)) 12134 // Scale represents reg2 * scale, thus account for 1 if 12135 // it is not equal to 0 or 1. 12136 return AM.Scale != 0 && AM.Scale != 1; 12137 return -1; 12138 } 12139 12140 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( 12141 const MachineFunction &MF, EVT VT) const { 12142 VT = VT.getScalarType(); 12143 12144 if (!VT.isSimple()) 12145 return false; 12146 12147 switch (VT.getSimpleVT().SimpleTy) { 12148 case MVT::f16: 12149 return Subtarget->hasFullFP16(); 12150 case MVT::f32: 12151 case MVT::f64: 12152 return true; 12153 default: 12154 break; 12155 } 12156 12157 return false; 12158 } 12159 12160 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, 12161 Type *Ty) const { 12162 switch (Ty->getScalarType()->getTypeID()) { 12163 case Type::FloatTyID: 12164 case Type::DoubleTyID: 12165 return true; 12166 default: 12167 return false; 12168 } 12169 } 12170 12171 bool AArch64TargetLowering::generateFMAsInMachineCombiner( 12172 EVT VT, CodeGenOpt::Level OptLevel) const { 12173 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector(); 12174 } 12175 12176 const MCPhysReg * 12177 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 12178 // LR is a callee-save register, but we must treat it as clobbered by any call 12179 // site. Hence we include LR in the scratch registers, which are in turn added 12180 // as implicit-defs for stackmaps and patchpoints. 
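  // X16 and X17 are the AAPCS64 intra-procedure-call scratch registers
  // (IP0 and IP1), which linker-inserted veneers are already free to clobber,
  // so they are natural scratch registers here too.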
12181 static const MCPhysReg ScratchRegs[] = { 12182 AArch64::X16, AArch64::X17, AArch64::LR, 0 12183 }; 12184 return ScratchRegs; 12185 } 12186 12187 bool 12188 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 12189 CombineLevel Level) const { 12190 N = N->getOperand(0).getNode(); 12191 EVT VT = N->getValueType(0); 12192 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine 12193 // it with shift to let it be lowered to UBFX. 12194 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 12195 isa<ConstantSDNode>(N->getOperand(1))) { 12196 uint64_t TruncMask = N->getConstantOperandVal(1); 12197 if (isMask_64(TruncMask) && 12198 N->getOperand(0).getOpcode() == ISD::SRL && 12199 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) 12200 return false; 12201 } 12202 return true; 12203 } 12204 12205 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 12206 Type *Ty) const { 12207 assert(Ty->isIntegerTy()); 12208 12209 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 12210 if (BitSize == 0) 12211 return false; 12212 12213 int64_t Val = Imm.getSExtValue(); 12214 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 12215 return true; 12216 12217 if ((int64_t)Val < 0) 12218 Val = ~Val; 12219 if (BitSize == 32) 12220 Val &= (1LL << 32) - 1; 12221 12222 unsigned LZ = countLeadingZeros((uint64_t)Val); 12223 unsigned Shift = (63 - LZ) / 16; 12224 // MOVZ is free so return true for one or fewer MOVK. 12225 return Shift < 3; 12226 } 12227 12228 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 12229 unsigned Index) const { 12230 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 12231 return false; 12232 12233 return (Index == 0 || Index == ResVT.getVectorNumElements()); 12234 } 12235 12236 /// Turn vector tests of the signbit in the form of: 12237 /// xor (sra X, elt_size(X)-1), -1 12238 /// into: 12239 /// cmge X, X, #0 12240 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 12241 const AArch64Subtarget *Subtarget) { 12242 EVT VT = N->getValueType(0); 12243 if (!Subtarget->hasNEON() || !VT.isVector()) 12244 return SDValue(); 12245 12246 // There must be a shift right algebraic before the xor, and the xor must be a 12247 // 'not' operation. 12248 SDValue Shift = N->getOperand(0); 12249 SDValue Ones = N->getOperand(1); 12250 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || 12251 !ISD::isBuildVectorAllOnes(Ones.getNode())) 12252 return SDValue(); 12253 12254 // The shift should be smearing the sign bit across each vector element. 12255 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 12256 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 12257 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 12258 return SDValue(); 12259 12260 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); 12261 } 12262 12263 // Given a vecreduce_add node, detect the below pattern and convert it to the 12264 // node sequence with UABDL, [S|U]ADB and UADDLP. 
12265 // 12266 // i32 vecreduce_add( 12267 // v16i32 abs( 12268 // v16i32 sub( 12269 // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b)))) 12270 // =================> 12271 // i32 vecreduce_add( 12272 // v4i32 UADDLP( 12273 // v8i16 add( 12274 // v8i16 zext( 12275 // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b 12276 // v8i16 zext( 12277 // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b 12278 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, 12279 SelectionDAG &DAG) { 12280 // Assumed i32 vecreduce_add 12281 if (N->getValueType(0) != MVT::i32) 12282 return SDValue(); 12283 12284 SDValue VecReduceOp0 = N->getOperand(0); 12285 unsigned Opcode = VecReduceOp0.getOpcode(); 12286 // Assumed v16i32 abs 12287 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32) 12288 return SDValue(); 12289 12290 SDValue ABS = VecReduceOp0; 12291 // Assumed v16i32 sub 12292 if (ABS->getOperand(0)->getOpcode() != ISD::SUB || 12293 ABS->getOperand(0)->getValueType(0) != MVT::v16i32) 12294 return SDValue(); 12295 12296 SDValue SUB = ABS->getOperand(0); 12297 unsigned Opcode0 = SUB->getOperand(0).getOpcode(); 12298 unsigned Opcode1 = SUB->getOperand(1).getOpcode(); 12299 // Assumed v16i32 type 12300 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 || 12301 SUB->getOperand(1)->getValueType(0) != MVT::v16i32) 12302 return SDValue(); 12303 12304 // Assumed zext or sext 12305 bool IsZExt = false; 12306 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) { 12307 IsZExt = true; 12308 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) { 12309 IsZExt = false; 12310 } else 12311 return SDValue(); 12312 12313 SDValue EXT0 = SUB->getOperand(0); 12314 SDValue EXT1 = SUB->getOperand(1); 12315 // Assumed zext's operand has v16i8 type 12316 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 || 12317 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8) 12318 return SDValue(); 12319 12320 // Pattern is dectected. Let's convert it to sequence of nodes. 12321 SDLoc DL(N); 12322 12323 // First, create the node pattern of UABD/SABD. 12324 SDValue UABDHigh8Op0 = 12325 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), 12326 DAG.getConstant(8, DL, MVT::i64)); 12327 SDValue UABDHigh8Op1 = 12328 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), 12329 DAG.getConstant(8, DL, MVT::i64)); 12330 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, 12331 UABDHigh8Op0, UABDHigh8Op1); 12332 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8); 12333 12334 // Second, create the node pattern of UABAL. 12335 SDValue UABDLo8Op0 = 12336 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), 12337 DAG.getConstant(0, DL, MVT::i64)); 12338 SDValue UABDLo8Op1 = 12339 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), 12340 DAG.getConstant(0, DL, MVT::i64)); 12341 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, 12342 UABDLo8Op0, UABDLo8Op1); 12343 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8); 12344 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD); 12345 12346 // Third, create the node of UADDLP. 12347 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL); 12348 12349 // Fourth, create the node of VECREDUCE_ADD. 
12350 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP); 12351 } 12352 12353 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce 12354 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) 12355 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) 12356 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, 12357 const AArch64Subtarget *ST) { 12358 if (!ST->hasDotProd()) 12359 return performVecReduceAddCombineWithUADDLP(N, DAG); 12360 12361 SDValue Op0 = N->getOperand(0); 12362 if (N->getValueType(0) != MVT::i32 || 12363 Op0.getValueType().getVectorElementType() != MVT::i32) 12364 return SDValue(); 12365 12366 unsigned ExtOpcode = Op0.getOpcode(); 12367 SDValue A = Op0; 12368 SDValue B; 12369 if (ExtOpcode == ISD::MUL) { 12370 A = Op0.getOperand(0); 12371 B = Op0.getOperand(1); 12372 if (A.getOpcode() != B.getOpcode() || 12373 A.getOperand(0).getValueType() != B.getOperand(0).getValueType()) 12374 return SDValue(); 12375 ExtOpcode = A.getOpcode(); 12376 } 12377 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) 12378 return SDValue(); 12379 12380 EVT Op0VT = A.getOperand(0).getValueType(); 12381 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8) 12382 return SDValue(); 12383 12384 SDLoc DL(Op0); 12385 // For non-mla reductions B can be set to 1. For MLA we take the operand of 12386 // the extend B. 12387 if (!B) 12388 B = DAG.getConstant(1, DL, Op0VT); 12389 else 12390 B = B.getOperand(0); 12391 12392 SDValue Zeros = 12393 DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32); 12394 auto DotOpcode = 12395 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT; 12396 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, 12397 A.getOperand(0), B); 12398 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); 12399 } 12400 12401 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, 12402 TargetLowering::DAGCombinerInfo &DCI, 12403 const AArch64Subtarget *Subtarget) { 12404 if (DCI.isBeforeLegalizeOps()) 12405 return SDValue(); 12406 12407 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); 12408 } 12409 12410 SDValue 12411 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 12412 SelectionDAG &DAG, 12413 SmallVectorImpl<SDNode *> &Created) const { 12414 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 12415 if (isIntDivCheap(N->getValueType(0), Attr)) 12416 return SDValue(N,0); // Lower SDIV as SDIV 12417 12418 // fold (sdiv X, pow2) 12419 EVT VT = N->getValueType(0); 12420 if ((VT != MVT::i32 && VT != MVT::i64) || 12421 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 12422 return SDValue(); 12423 12424 SDLoc DL(N); 12425 SDValue N0 = N->getOperand(0); 12426 unsigned Lg2 = Divisor.countTrailingZeros(); 12427 SDValue Zero = DAG.getConstant(0, DL, VT); 12428 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 12429 12430 // Add (N0 < 0) ? Pow2 - 1 : 0; 12431 SDValue CCVal; 12432 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); 12433 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); 12434 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); 12435 12436 Created.push_back(Cmp.getNode()); 12437 Created.push_back(Add.getNode()); 12438 Created.push_back(CSel.getNode()); 12439 12440 // Divide by pow2. 
12441 SDValue SRA = 12442 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); 12443 12444 // If we're dividing by a positive value, we're done. Otherwise, we must 12445 // negate the result. 12446 if (Divisor.isNonNegative()) 12447 return SRA; 12448 12449 Created.push_back(SRA.getNode()); 12450 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); 12451 } 12452 12453 static bool IsSVECntIntrinsic(SDValue S) { 12454 switch(getIntrinsicID(S.getNode())) { 12455 default: 12456 break; 12457 case Intrinsic::aarch64_sve_cntb: 12458 case Intrinsic::aarch64_sve_cnth: 12459 case Intrinsic::aarch64_sve_cntw: 12460 case Intrinsic::aarch64_sve_cntd: 12461 return true; 12462 } 12463 return false; 12464 } 12465 12466 /// Calculates what the pre-extend type is, based on the extension 12467 /// operation node provided by \p Extend. 12468 /// 12469 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the 12470 /// pre-extend type is pulled directly from the operand, while other extend 12471 /// operations need a bit more inspection to get this information. 12472 /// 12473 /// \param Extend The SDNode from the DAG that represents the extend operation 12474 /// \param DAG The SelectionDAG hosting the \p Extend node 12475 /// 12476 /// \returns The type representing the \p Extend source type, or \p MVT::Other 12477 /// if no valid type can be determined 12478 static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { 12479 switch (Extend.getOpcode()) { 12480 case ISD::SIGN_EXTEND: 12481 case ISD::ZERO_EXTEND: 12482 return Extend.getOperand(0).getValueType(); 12483 case ISD::AssertSext: 12484 case ISD::AssertZext: 12485 case ISD::SIGN_EXTEND_INREG: { 12486 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1)); 12487 if (!TypeNode) 12488 return MVT::Other; 12489 return TypeNode->getVT(); 12490 } 12491 case ISD::AND: { 12492 ConstantSDNode *Constant = 12493 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode()); 12494 if (!Constant) 12495 return MVT::Other; 12496 12497 uint32_t Mask = Constant->getZExtValue(); 12498 12499 if (Mask == UCHAR_MAX) 12500 return MVT::i8; 12501 else if (Mask == USHRT_MAX) 12502 return MVT::i16; 12503 else if (Mask == UINT_MAX) 12504 return MVT::i32; 12505 12506 return MVT::Other; 12507 } 12508 default: 12509 return MVT::Other; 12510 } 12511 12512 llvm_unreachable("Code path unhandled in calculatePreExtendType!"); 12513 } 12514 12515 /// Combines a dup(sext/zext) node pattern into sext/zext(dup) 12516 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt 12517 static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, 12518 SelectionDAG &DAG) { 12519 12520 ShuffleVectorSDNode *ShuffleNode = 12521 dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode()); 12522 if (!ShuffleNode) 12523 return SDValue(); 12524 12525 // Ensuring the mask is zero before continuing 12526 if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) 12527 return SDValue(); 12528 12529 SDValue InsertVectorElt = VectorShuffle.getOperand(0); 12530 12531 if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) 12532 return SDValue(); 12533 12534 SDValue InsertLane = InsertVectorElt.getOperand(2); 12535 ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode()); 12536 // Ensures the insert is inserting into lane 0 12537 if (!Constant || Constant->getZExtValue() != 0) 12538 return SDValue(); 12539 12540 SDValue Extend = InsertVectorElt.getOperand(1); 12541 unsigned ExtendOpcode = 
Extend.getOpcode(); 12542 12543 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || 12544 ExtendOpcode == ISD::SIGN_EXTEND_INREG || 12545 ExtendOpcode == ISD::AssertSext; 12546 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && 12547 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) 12548 return SDValue(); 12549 12550 EVT TargetType = VectorShuffle.getValueType(); 12551 EVT PreExtendType = calculatePreExtendType(Extend, DAG); 12552 12553 if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && 12554 TargetType != MVT::v2i64) || 12555 (PreExtendType == MVT::Other)) 12556 return SDValue(); 12557 12558 // Restrict valid pre-extend data type 12559 if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && 12560 PreExtendType != MVT::i32) 12561 return SDValue(); 12562 12563 EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); 12564 12565 if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) 12566 return SDValue(); 12567 12568 if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) 12569 return SDValue(); 12570 12571 SDLoc DL(VectorShuffle); 12572 12573 SDValue InsertVectorNode = DAG.getNode( 12574 InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), 12575 DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), 12576 DAG.getConstant(0, DL, MVT::i64)); 12577 12578 std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue()); 12579 12580 SDValue VectorShuffleNode = 12581 DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, 12582 DAG.getUNDEF(PreExtendVT), ShuffleMask); 12583 12584 SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 12585 DL, TargetType, VectorShuffleNode); 12586 12587 return ExtendNode; 12588 } 12589 12590 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) 12591 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt 12592 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { 12593 // If the value type isn't a vector, none of the operands are going to be dups 12594 if (!Mul->getValueType(0).isVector()) 12595 return SDValue(); 12596 12597 SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); 12598 SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); 12599 12600 // Neither operands have been changed, don't make any further changes 12601 if (!Op0 && !Op1) 12602 return SDValue(); 12603 12604 SDLoc DL(Mul); 12605 return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), 12606 Op0 ? Op0 : Mul->getOperand(0), 12607 Op1 ? Op1 : Mul->getOperand(1)); 12608 } 12609 12610 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 12611 TargetLowering::DAGCombinerInfo &DCI, 12612 const AArch64Subtarget *Subtarget) { 12613 12614 if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) 12615 return Ext; 12616 12617 if (DCI.isBeforeLegalizeOps()) 12618 return SDValue(); 12619 12620 // The below optimizations require a constant RHS. 12621 if (!isa<ConstantSDNode>(N->getOperand(1))) 12622 return SDValue(); 12623 12624 SDValue N0 = N->getOperand(0); 12625 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1)); 12626 const APInt &ConstValue = C->getAPIntValue(); 12627 12628 // Allow the scaling to be folded into the `cnt` instruction by preventing 12629 // the scaling to be obscured here. This makes it easier to pattern match. 
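  // For example, a multiply of aarch64_sve_cntd by a constant in [1, 16] is
  // left alone so that isel can fold the factor into the instruction's
  // multiplier operand, e.g. "cntd x0, all, mul #4", rather than emitting a
  // cntd followed by a separate multiply.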
  if (IsSVECntIntrinsic(N0) ||
      (N0->getOpcode() == ISD::TRUNCATE &&
       (IsSVECntIntrinsic(N0->getOperand(0)))))
    if (ConstValue.sge(1) && ConstValue.sle(16))
      return SDValue();

  // Multiplication by a power of two plus/minus one can be done more
  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45,
  // which equals (1+2)*16-(1+2).

  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
  unsigned TrailingZeroes = ConstValue.countTrailingZeros();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
                            isZeroExtended(N0.getNode(), DAG)))
      return SDValue();
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
                           N->use_begin()->getOpcode() == ISD::SUB))
      return SDValue();
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsN0 = true;
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = ISD::ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = ISD::SUB;
    } else
      return SDValue();
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = ISD::SUB;
      ShiftValUseIsN0 = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = ISD::ADD;
      NegateResult = true;
    } else
      return SDValue();
  }

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
                                   DAG.getConstant(ShiftAmt, DL, MVT::i64));

  SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
  SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
  SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
  assert(!(NegateResult && TrailingZeroes) &&
         "NegateResult and TrailingZeroes cannot both be true for now.");
  // Negate the result.
  if (NegateResult)
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
  // Shift the result.
  if (TrailingZeroes)
    return DAG.getNode(ISD::SHL, DL, VT, Res,
                       DAG.getConstant(TrailingZeroes, DL, MVT::i64));
  return Res;
}

static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
    return Res;

  EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue N0 = N->getOperand(0);
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      // Do not change the width of a volatile load.
12790 !cast<LoadSDNode>(N0)->isVolatile()) { 12791 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 12792 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 12793 LN0->getPointerInfo(), LN0->getAlignment(), 12794 LN0->getMemOperand()->getFlags()); 12795 12796 // Make sure successors of the original load stay after it by updating them 12797 // to use the new Chain. 12798 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 12799 12800 unsigned Opcode = 12801 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; 12802 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 12803 } 12804 12805 return SDValue(); 12806 } 12807 12808 /// Fold a floating-point multiply by power of two into floating-point to 12809 /// fixed-point conversion. 12810 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, 12811 TargetLowering::DAGCombinerInfo &DCI, 12812 const AArch64Subtarget *Subtarget) { 12813 if (!Subtarget->hasNEON()) 12814 return SDValue(); 12815 12816 if (!N->getValueType(0).isSimple()) 12817 return SDValue(); 12818 12819 SDValue Op = N->getOperand(0); 12820 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 12821 Op.getOpcode() != ISD::FMUL) 12822 return SDValue(); 12823 12824 SDValue ConstVec = Op->getOperand(1); 12825 if (!isa<BuildVectorSDNode>(ConstVec)) 12826 return SDValue(); 12827 12828 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 12829 uint32_t FloatBits = FloatTy.getSizeInBits(); 12830 if (FloatBits != 32 && FloatBits != 64) 12831 return SDValue(); 12832 12833 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 12834 uint32_t IntBits = IntTy.getSizeInBits(); 12835 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 12836 return SDValue(); 12837 12838 // Avoid conversions where iN is larger than the float (e.g., float -> i64). 12839 if (IntBits > FloatBits) 12840 return SDValue(); 12841 12842 BitVector UndefElements; 12843 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 12844 int32_t Bits = IntBits == 64 ? 64 : 32; 12845 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 12846 if (C == -1 || C == 0 || C > Bits) 12847 return SDValue(); 12848 12849 MVT ResTy; 12850 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 12851 switch (NumLanes) { 12852 default: 12853 return SDValue(); 12854 case 2: 12855 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 12856 break; 12857 case 4: 12858 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 12859 break; 12860 } 12861 12862 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 12863 return SDValue(); 12864 12865 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && 12866 "Illegal vector type after legalization"); 12867 12868 SDLoc DL(N); 12869 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 12870 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 12871 : Intrinsic::aarch64_neon_vcvtfp2fxu; 12872 SDValue FixConv = 12873 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 12874 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 12875 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 12876 // We can handle smaller integers by generating an extra trunc. 12877 if (IntBits < FloatBits) 12878 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 12879 12880 return FixConv; 12881 } 12882 12883 /// Fold a floating-point divide by power of two into fixed-point to 12884 /// floating-point conversion. 
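/// For example, assuming the checks below succeed,
///   (v4f32 (fdiv (sint_to_fp (v4i32 X)), splat 16.0))
/// can instead be selected as a fixed-point convert with four fractional
/// bits, i.e. "scvtf v0.4s, v0.4s, #4".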
12885 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 12886 TargetLowering::DAGCombinerInfo &DCI, 12887 const AArch64Subtarget *Subtarget) { 12888 if (!Subtarget->hasNEON()) 12889 return SDValue(); 12890 12891 SDValue Op = N->getOperand(0); 12892 unsigned Opc = Op->getOpcode(); 12893 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 12894 !Op.getOperand(0).getValueType().isSimple() || 12895 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 12896 return SDValue(); 12897 12898 SDValue ConstVec = N->getOperand(1); 12899 if (!isa<BuildVectorSDNode>(ConstVec)) 12900 return SDValue(); 12901 12902 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 12903 int32_t IntBits = IntTy.getSizeInBits(); 12904 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 12905 return SDValue(); 12906 12907 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 12908 int32_t FloatBits = FloatTy.getSizeInBits(); 12909 if (FloatBits != 32 && FloatBits != 64) 12910 return SDValue(); 12911 12912 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 12913 if (IntBits > FloatBits) 12914 return SDValue(); 12915 12916 BitVector UndefElements; 12917 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 12918 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 12919 if (C == -1 || C == 0 || C > FloatBits) 12920 return SDValue(); 12921 12922 MVT ResTy; 12923 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 12924 switch (NumLanes) { 12925 default: 12926 return SDValue(); 12927 case 2: 12928 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 12929 break; 12930 case 4: 12931 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 12932 break; 12933 } 12934 12935 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 12936 return SDValue(); 12937 12938 SDLoc DL(N); 12939 SDValue ConvInput = Op.getOperand(0); 12940 bool IsSigned = Opc == ISD::SINT_TO_FP; 12941 if (IntBits < FloatBits) 12942 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 12943 ResTy, ConvInput); 12944 12945 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp 12946 : Intrinsic::aarch64_neon_vcvtfxu2fp; 12947 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 12948 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 12949 DAG.getConstant(C, DL, MVT::i32)); 12950 } 12951 12952 /// An EXTR instruction is made up of two shifts, ORed together. This helper 12953 /// searches for and classifies those shifts. 12954 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 12955 bool &FromHi) { 12956 if (N.getOpcode() == ISD::SHL) 12957 FromHi = false; 12958 else if (N.getOpcode() == ISD::SRL) 12959 FromHi = true; 12960 else 12961 return false; 12962 12963 if (!isa<ConstantSDNode>(N.getOperand(1))) 12964 return false; 12965 12966 ShiftAmount = N->getConstantOperandVal(1); 12967 Src = N->getOperand(0); 12968 return true; 12969 } 12970 12971 /// EXTR instruction extracts a contiguous chunk of bits from two existing 12972 /// registers viewed as a high/low pair. This function looks for the pattern: 12973 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it 12974 /// with an EXTR. Can't quite be done in TableGen because the two immediates 12975 /// aren't independent. 
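/// For instance, on i32 the pattern (or (shl %x, #8), (srl %y, #24)) would
/// become EXTR Wd, Wx, Wy, #24, i.e. the low 24 bits of %x followed by the
/// high 8 bits of %y (register names purely illustrative).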
12976 static SDValue tryCombineToEXTR(SDNode *N, 12977 TargetLowering::DAGCombinerInfo &DCI) { 12978 SelectionDAG &DAG = DCI.DAG; 12979 SDLoc DL(N); 12980 EVT VT = N->getValueType(0); 12981 12982 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 12983 12984 if (VT != MVT::i32 && VT != MVT::i64) 12985 return SDValue(); 12986 12987 SDValue LHS; 12988 uint32_t ShiftLHS = 0; 12989 bool LHSFromHi = false; 12990 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 12991 return SDValue(); 12992 12993 SDValue RHS; 12994 uint32_t ShiftRHS = 0; 12995 bool RHSFromHi = false; 12996 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 12997 return SDValue(); 12998 12999 // If they're both trying to come from the high part of the register, they're 13000 // not really an EXTR. 13001 if (LHSFromHi == RHSFromHi) 13002 return SDValue(); 13003 13004 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 13005 return SDValue(); 13006 13007 if (LHSFromHi) { 13008 std::swap(LHS, RHS); 13009 std::swap(ShiftLHS, ShiftRHS); 13010 } 13011 13012 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 13013 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 13014 } 13015 13016 static SDValue tryCombineToBSL(SDNode *N, 13017 TargetLowering::DAGCombinerInfo &DCI) { 13018 EVT VT = N->getValueType(0); 13019 SelectionDAG &DAG = DCI.DAG; 13020 SDLoc DL(N); 13021 13022 if (!VT.isVector()) 13023 return SDValue(); 13024 13025 // The combining code currently only works for NEON vectors. In particular, 13026 // it does not work for SVE when dealing with vectors wider than 128 bits. 13027 if (!VT.is64BitVector() && !VT.is128BitVector()) 13028 return SDValue(); 13029 13030 SDValue N0 = N->getOperand(0); 13031 if (N0.getOpcode() != ISD::AND) 13032 return SDValue(); 13033 13034 SDValue N1 = N->getOperand(1); 13035 if (N1.getOpcode() != ISD::AND) 13036 return SDValue(); 13037 13038 // InstCombine does (not (neg a)) => (add a -1). 13039 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) 13040 // Loop over all combinations of AND operands. 13041 for (int i = 1; i >= 0; --i) { 13042 for (int j = 1; j >= 0; --j) { 13043 SDValue O0 = N0->getOperand(i); 13044 SDValue O1 = N1->getOperand(j); 13045 SDValue Sub, Add, SubSibling, AddSibling; 13046 13047 // Find a SUB and an ADD operand, one from each AND. 13048 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { 13049 Sub = O0; 13050 Add = O1; 13051 SubSibling = N0->getOperand(1 - i); 13052 AddSibling = N1->getOperand(1 - j); 13053 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { 13054 Add = O0; 13055 Sub = O1; 13056 AddSibling = N0->getOperand(1 - i); 13057 SubSibling = N1->getOperand(1 - j); 13058 } else 13059 continue; 13060 13061 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode())) 13062 continue; 13063 13064 // Constant ones is always righthand operand of the Add. 13065 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode())) 13066 continue; 13067 13068 if (Sub.getOperand(1) != Add.getOperand(0)) 13069 continue; 13070 13071 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); 13072 } 13073 } 13074 13075 // (or (and a b) (and (not a) c)) => (bsl a b c) 13076 // We only have to look for constant vectors here since the general, variable 13077 // case can be handled in TableGen. 13078 unsigned Bits = VT.getScalarSizeInBits(); 13079 uint64_t BitMask = Bits == 64 ? 
-1ULL : ((1ULL << Bits) - 1); 13080 for (int i = 1; i >= 0; --i) 13081 for (int j = 1; j >= 0; --j) { 13082 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 13083 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 13084 if (!BVN0 || !BVN1) 13085 continue; 13086 13087 bool FoundMatch = true; 13088 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 13089 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 13090 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 13091 if (!CN0 || !CN1 || 13092 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 13093 FoundMatch = false; 13094 break; 13095 } 13096 } 13097 13098 if (FoundMatch) 13099 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), 13100 N0->getOperand(1 - i), N1->getOperand(1 - j)); 13101 } 13102 13103 return SDValue(); 13104 } 13105 13106 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 13107 const AArch64Subtarget *Subtarget) { 13108 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 13109 SelectionDAG &DAG = DCI.DAG; 13110 EVT VT = N->getValueType(0); 13111 13112 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13113 return SDValue(); 13114 13115 if (SDValue Res = tryCombineToEXTR(N, DCI)) 13116 return Res; 13117 13118 if (SDValue Res = tryCombineToBSL(N, DCI)) 13119 return Res; 13120 13121 return SDValue(); 13122 } 13123 13124 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { 13125 if (!MemVT.getVectorElementType().isSimple()) 13126 return false; 13127 13128 uint64_t MaskForTy = 0ull; 13129 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { 13130 case MVT::i8: 13131 MaskForTy = 0xffull; 13132 break; 13133 case MVT::i16: 13134 MaskForTy = 0xffffull; 13135 break; 13136 case MVT::i32: 13137 MaskForTy = 0xffffffffull; 13138 break; 13139 default: 13140 return false; 13141 break; 13142 } 13143 13144 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) 13145 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0))) 13146 return Op0->getAPIntValue().getLimitedValue() == MaskForTy; 13147 13148 return false; 13149 } 13150 13151 static SDValue performSVEAndCombine(SDNode *N, 13152 TargetLowering::DAGCombinerInfo &DCI) { 13153 if (DCI.isBeforeLegalizeOps()) 13154 return SDValue(); 13155 13156 SelectionDAG &DAG = DCI.DAG; 13157 SDValue Src = N->getOperand(0); 13158 unsigned Opc = Src->getOpcode(); 13159 13160 // Zero/any extend of an unsigned unpack 13161 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 13162 SDValue UnpkOp = Src->getOperand(0); 13163 SDValue Dup = N->getOperand(1); 13164 13165 if (Dup.getOpcode() != AArch64ISD::DUP) 13166 return SDValue(); 13167 13168 SDLoc DL(N); 13169 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); 13170 uint64_t ExtVal = C->getZExtValue(); 13171 13172 // If the mask is fully covered by the unpack, we don't need to push 13173 // a new AND onto the operand 13174 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); 13175 if ((ExtVal == 0xFF && EltTy == MVT::i8) || 13176 (ExtVal == 0xFFFF && EltTy == MVT::i16) || 13177 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) 13178 return Src; 13179 13180 // Truncate to prevent a DUP with an over wide constant 13181 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); 13182 13183 // Otherwise, make sure we propagate the AND to the operand 13184 // of the unpack 13185 Dup = DAG.getNode(AArch64ISD::DUP, DL, 
13186 UnpkOp->getValueType(0), 13187 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); 13188 13189 SDValue And = DAG.getNode(ISD::AND, DL, 13190 UnpkOp->getValueType(0), UnpkOp, Dup); 13191 13192 return DAG.getNode(Opc, DL, N->getValueType(0), And); 13193 } 13194 13195 if (!EnableCombineMGatherIntrinsics) 13196 return SDValue(); 13197 13198 SDValue Mask = N->getOperand(1); 13199 13200 if (!Src.hasOneUse()) 13201 return SDValue(); 13202 13203 EVT MemVT; 13204 13205 // SVE load instructions perform an implicit zero-extend, which makes them 13206 // perfect candidates for combining. 13207 switch (Opc) { 13208 case AArch64ISD::LD1_MERGE_ZERO: 13209 case AArch64ISD::LDNF1_MERGE_ZERO: 13210 case AArch64ISD::LDFF1_MERGE_ZERO: 13211 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); 13212 break; 13213 case AArch64ISD::GLD1_MERGE_ZERO: 13214 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 13215 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 13216 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 13217 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 13218 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 13219 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 13220 case AArch64ISD::GLDFF1_MERGE_ZERO: 13221 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 13222 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 13223 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 13224 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 13225 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 13226 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 13227 case AArch64ISD::GLDNT1_MERGE_ZERO: 13228 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); 13229 break; 13230 default: 13231 return SDValue(); 13232 } 13233 13234 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) 13235 return Src; 13236 13237 return SDValue(); 13238 } 13239 13240 static SDValue performANDCombine(SDNode *N, 13241 TargetLowering::DAGCombinerInfo &DCI) { 13242 SelectionDAG &DAG = DCI.DAG; 13243 SDValue LHS = N->getOperand(0); 13244 EVT VT = N->getValueType(0); 13245 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13246 return SDValue(); 13247 13248 if (VT.isScalableVector()) 13249 return performSVEAndCombine(N, DCI); 13250 13251 // The combining code below works only for NEON vectors. In particular, it 13252 // does not work for SVE when dealing with vectors wider than 128 bits. 13253 if (!(VT.is64BitVector() || VT.is128BitVector())) 13254 return SDValue(); 13255 13256 BuildVectorSDNode *BVN = 13257 dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); 13258 if (!BVN) 13259 return SDValue(); 13260 13261 // AND does not accept an immediate, so check if we can use a BIC immediate 13262 // instruction instead. We do this here instead of using a (and x, (mvni imm)) 13263 // pattern in isel, because some immediates may be lowered to the preferred 13264 // (and x, (movi imm)) form, even though an mvni representation also exists. 
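  // For example, (and (v4i32 X), splat(0xffffff00)) can be selected as
  // "bic v0.4s, #0xff", since the inverted mask 0x000000ff fits the 8-bit
  // AdvSIMD modified-immediate encoding used by BICi.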
13265 APInt DefBits(VT.getSizeInBits(), 0); 13266 APInt UndefBits(VT.getSizeInBits(), 0); 13267 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 13268 SDValue NewOp; 13269 13270 DefBits = ~DefBits; 13271 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 13272 DefBits, &LHS)) || 13273 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 13274 DefBits, &LHS))) 13275 return NewOp; 13276 13277 UndefBits = ~UndefBits; 13278 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 13279 UndefBits, &LHS)) || 13280 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 13281 UndefBits, &LHS))) 13282 return NewOp; 13283 } 13284 13285 return SDValue(); 13286 } 13287 13288 static SDValue performSRLCombine(SDNode *N, 13289 TargetLowering::DAGCombinerInfo &DCI) { 13290 SelectionDAG &DAG = DCI.DAG; 13291 EVT VT = N->getValueType(0); 13292 if (VT != MVT::i32 && VT != MVT::i64) 13293 return SDValue(); 13294 13295 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the 13296 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) 13297 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 13298 SDValue N0 = N->getOperand(0); 13299 if (N0.getOpcode() == ISD::BSWAP) { 13300 SDLoc DL(N); 13301 SDValue N1 = N->getOperand(1); 13302 SDValue N00 = N0.getOperand(0); 13303 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 13304 uint64_t ShiftAmt = C->getZExtValue(); 13305 if (VT == MVT::i32 && ShiftAmt == 16 && 13306 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) 13307 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 13308 if (VT == MVT::i64 && ShiftAmt == 32 && 13309 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) 13310 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 13311 } 13312 } 13313 return SDValue(); 13314 } 13315 13316 // Attempt to form urhadd(OpA, OpB) from 13317 // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) 13318 // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). 13319 // The original form of the first expression is 13320 // truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the 13321 // (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). 13322 // Before this function is called the srl will have been lowered to 13323 // AArch64ISD::VLSHR. 13324 // This pass can also recognize signed variants of the patterns that use sign 13325 // extension instead of zero extension and form a srhadd(OpA, OpB) or a 13326 // shadd(OpA, OpB) from them. 13327 static SDValue 13328 performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 13329 SelectionDAG &DAG) { 13330 EVT VT = N->getValueType(0); 13331 13332 // Since we are looking for a right shift by a constant value of 1 and we are 13333 // operating on types at least 16 bits in length (sign/zero extended OpA and 13334 // OpB, which are at least 8 bits), it follows that the truncate will always 13335 // discard the shifted-in bit and therefore the right shift will be logical 13336 // regardless of the signedness of OpA and OpB. 13337 SDValue Shift = N->getOperand(0); 13338 if (Shift.getOpcode() != AArch64ISD::VLSHR) 13339 return SDValue(); 13340 13341 // Is the right shift using an immediate value of 1? 
13342 uint64_t ShiftAmount = Shift.getConstantOperandVal(1); 13343 if (ShiftAmount != 1) 13344 return SDValue(); 13345 13346 SDValue ExtendOpA, ExtendOpB; 13347 SDValue ShiftOp0 = Shift.getOperand(0); 13348 unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); 13349 if (ShiftOp0Opc == ISD::SUB) { 13350 13351 SDValue Xor = ShiftOp0.getOperand(1); 13352 if (Xor.getOpcode() != ISD::XOR) 13353 return SDValue(); 13354 13355 // Is the XOR using a constant amount of all ones in the right hand side? 13356 uint64_t C; 13357 if (!isAllConstantBuildVector(Xor.getOperand(1), C)) 13358 return SDValue(); 13359 13360 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 13361 APInt CAsAPInt(ElemSizeInBits, C); 13362 if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) 13363 return SDValue(); 13364 13365 ExtendOpA = Xor.getOperand(0); 13366 ExtendOpB = ShiftOp0.getOperand(0); 13367 } else if (ShiftOp0Opc == ISD::ADD) { 13368 ExtendOpA = ShiftOp0.getOperand(0); 13369 ExtendOpB = ShiftOp0.getOperand(1); 13370 } else 13371 return SDValue(); 13372 13373 unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); 13374 unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); 13375 if (!(ExtendOpAOpc == ExtendOpBOpc && 13376 (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) 13377 return SDValue(); 13378 13379 // Is the result of the right shift being truncated to the same value type as 13380 // the original operands, OpA and OpB? 13381 SDValue OpA = ExtendOpA.getOperand(0); 13382 SDValue OpB = ExtendOpB.getOperand(0); 13383 EVT OpAVT = OpA.getValueType(); 13384 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); 13385 if (!(VT == OpAVT && OpAVT == OpB.getValueType())) 13386 return SDValue(); 13387 13388 SDLoc DL(N); 13389 bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; 13390 bool IsRHADD = ShiftOp0Opc == ISD::SUB; 13391 unsigned HADDOpc = IsSignExtend 13392 ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) 13393 : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD); 13394 SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); 13395 13396 return ResultHADD; 13397 } 13398 13399 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { 13400 switch (Opcode) { 13401 case ISD::FADD: 13402 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; 13403 case ISD::ADD: 13404 return VT == MVT::i64; 13405 default: 13406 return false; 13407 } 13408 } 13409 13410 static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { 13411 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 13412 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1); 13413 13414 EVT VT = N->getValueType(0); 13415 const bool FullFP16 = 13416 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 13417 13418 // Rewrite for pairwise fadd pattern 13419 // (f32 (extract_vector_elt 13420 // (fadd (vXf32 Other) 13421 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) 13422 // -> 13423 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) 13424 // (extract_vector_elt (vXf32 Other) 1)) 13425 if (ConstantN1 && ConstantN1->getZExtValue() == 0 && 13426 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { 13427 SDLoc DL(N0); 13428 SDValue N00 = N0->getOperand(0); 13429 SDValue N01 = N0->getOperand(1); 13430 13431 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01); 13432 SDValue Other = N00; 13433 13434 // And handle the commutative case. 
13435 if (!Shuffle) { 13436 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00); 13437 Other = N01; 13438 } 13439 13440 if (Shuffle && Shuffle->getMaskElt(0) == 1 && 13441 Other == Shuffle->getOperand(0)) { 13442 return DAG.getNode(N0->getOpcode(), DL, VT, 13443 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, 13444 DAG.getConstant(0, DL, MVT::i64)), 13445 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, 13446 DAG.getConstant(1, DL, MVT::i64))); 13447 } 13448 } 13449 13450 return SDValue(); 13451 } 13452 13453 static SDValue performConcatVectorsCombine(SDNode *N, 13454 TargetLowering::DAGCombinerInfo &DCI, 13455 SelectionDAG &DAG) { 13456 SDLoc dl(N); 13457 EVT VT = N->getValueType(0); 13458 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 13459 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); 13460 13461 // Optimize concat_vectors of truncated vectors, where the intermediate 13462 // type is illegal, to avoid said illegality, e.g., 13463 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 13464 // (v2i16 (truncate (v2i64))))) 13465 // -> 13466 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 13467 // (v4i32 (bitcast (v2i64))), 13468 // <0, 2, 4, 6>))) 13469 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 13470 // on both input and result type, so we might generate worse code. 13471 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 13472 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && 13473 N1Opc == ISD::TRUNCATE) { 13474 SDValue N00 = N0->getOperand(0); 13475 SDValue N10 = N1->getOperand(0); 13476 EVT N00VT = N00.getValueType(); 13477 13478 if (N00VT == N10.getValueType() && 13479 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 13480 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 13481 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); 13482 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 13483 for (size_t i = 0; i < Mask.size(); ++i) 13484 Mask[i] = i * 2; 13485 return DAG.getNode(ISD::TRUNCATE, dl, VT, 13486 DAG.getVectorShuffle( 13487 MidVT, dl, 13488 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 13489 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 13490 } 13491 } 13492 13493 // Wait 'til after everything is legalized to try this. That way we have 13494 // legal vector types and such. 13495 if (DCI.isBeforeLegalizeOps()) 13496 return SDValue(); 13497 13498 // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted 13499 // subvectors from the same original vectors. Combine these into a single 13500 // [us]rhadd or [us]hadd that operates on the two original vectors. 
Example: 13501 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), 13502 // extract_subvector (v16i8 OpB, 13503 // <0>))), 13504 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), 13505 // extract_subvector (v16i8 OpB, 13506 // <8>))))) 13507 // -> 13508 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) 13509 if (N->getNumOperands() == 2 && N0Opc == N1Opc && 13510 (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || 13511 N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { 13512 SDValue N00 = N0->getOperand(0); 13513 SDValue N01 = N0->getOperand(1); 13514 SDValue N10 = N1->getOperand(0); 13515 SDValue N11 = N1->getOperand(1); 13516 13517 EVT N00VT = N00.getValueType(); 13518 EVT N10VT = N10.getValueType(); 13519 13520 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && 13521 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && 13522 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && 13523 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { 13524 SDValue N00Source = N00->getOperand(0); 13525 SDValue N01Source = N01->getOperand(0); 13526 SDValue N10Source = N10->getOperand(0); 13527 SDValue N11Source = N11->getOperand(0); 13528 13529 if (N00Source == N10Source && N01Source == N11Source && 13530 N00Source.getValueType() == VT && N01Source.getValueType() == VT) { 13531 assert(N0.getValueType() == N1.getValueType()); 13532 13533 uint64_t N00Index = N00.getConstantOperandVal(1); 13534 uint64_t N01Index = N01.getConstantOperandVal(1); 13535 uint64_t N10Index = N10.getConstantOperandVal(1); 13536 uint64_t N11Index = N11.getConstantOperandVal(1); 13537 13538 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && 13539 N10Index == N00VT.getVectorNumElements()) 13540 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); 13541 } 13542 } 13543 } 13544 13545 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 13546 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 13547 // canonicalise to that. 13548 if (N0 == N1 && VT.getVectorNumElements() == 2) { 13549 assert(VT.getScalarSizeInBits() == 64); 13550 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 13551 DAG.getConstant(0, dl, MVT::i64)); 13552 } 13553 13554 // Canonicalise concat_vectors so that the right-hand vector has as few 13555 // bit-casts as possible before its real operation. The primary matching 13556 // destination for these operations will be the narrowing "2" instructions, 13557 // which depend on the operation being performed on this right-hand vector. 13558 // For example, 13559 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 13560 // becomes 13561 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 13562 13563 if (N1Opc != ISD::BITCAST) 13564 return SDValue(); 13565 SDValue RHS = N1->getOperand(0); 13566 MVT RHSTy = RHS.getValueType().getSimpleVT(); 13567 // If the RHS is not a vector, this is not the pattern we're looking for. 
13568 if (!RHSTy.isVector()) 13569 return SDValue(); 13570 13571 LLVM_DEBUG( 13572 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 13573 13574 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 13575 RHSTy.getVectorNumElements() * 2); 13576 return DAG.getNode(ISD::BITCAST, dl, VT, 13577 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 13578 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 13579 RHS)); 13580 } 13581 13582 static SDValue tryCombineFixedPointConvert(SDNode *N, 13583 TargetLowering::DAGCombinerInfo &DCI, 13584 SelectionDAG &DAG) { 13585 // Wait until after everything is legalized to try this. That way we have 13586 // legal vector types and such. 13587 if (DCI.isBeforeLegalizeOps()) 13588 return SDValue(); 13589 // Transform a scalar conversion of a value from a lane extract into a 13590 // lane extract of a vector conversion. E.g., from foo1 to foo2: 13591 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 13592 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 13593 // 13594 // The second form interacts better with instruction selection and the 13595 // register allocator to avoid cross-class register copies that aren't 13596 // coalescable due to a lane reference. 13597 13598 // Check the operand and see if it originates from a lane extract. 13599 SDValue Op1 = N->getOperand(1); 13600 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 13601 // Yep, no additional predication needed. Perform the transform. 13602 SDValue IID = N->getOperand(0); 13603 SDValue Shift = N->getOperand(2); 13604 SDValue Vec = Op1.getOperand(0); 13605 SDValue Lane = Op1.getOperand(1); 13606 EVT ResTy = N->getValueType(0); 13607 EVT VecResTy; 13608 SDLoc DL(N); 13609 13610 // The vector width should be 128 bits by the time we get here, even 13611 // if it started as 64 bits (the extract_vector handling will have 13612 // done so). 13613 assert(Vec.getValueSizeInBits() == 128 && 13614 "unexpected vector size on extract_vector_elt!"); 13615 if (Vec.getValueType() == MVT::v4i32) 13616 VecResTy = MVT::v4f32; 13617 else if (Vec.getValueType() == MVT::v2i64) 13618 VecResTy = MVT::v2f64; 13619 else 13620 llvm_unreachable("unexpected vector type!"); 13621 13622 SDValue Convert = 13623 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 13624 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 13625 } 13626 return SDValue(); 13627 } 13628 13629 // AArch64 high-vector "long" operations are formed by performing the non-high 13630 // version on an extract_subvector of each operand which gets the high half: 13631 // 13632 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 13633 // 13634 // However, there are cases which don't have an extract_high explicitly, but 13635 // have another operation that can be made compatible with one for free. For 13636 // example: 13637 // 13638 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 13639 // 13640 // This routine does the actual conversion of such DUPs, once outer routines 13641 // have determined that everything else is in order. 13642 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold 13643 // similarly here. 
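// Editorial illustration (sketch, not from the original source): a 64-bit
//   (v4i16 (AArch64ISD::DUP Wn))
// feeding a long operation is rewritten by the routine below into
//   (v4i16 (extract_subvector (v8i16 (AArch64ISD::DUP Wn)), (i64 4)))
// so the surrounding add/mul can be selected as its high-half "2" form
// (e.g. smull2/uaddl2) without an extra copy into the low half.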
13644 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
13645   switch (N.getOpcode()) {
13646   case AArch64ISD::DUP:
13647   case AArch64ISD::DUPLANE8:
13648   case AArch64ISD::DUPLANE16:
13649   case AArch64ISD::DUPLANE32:
13650   case AArch64ISD::DUPLANE64:
13651   case AArch64ISD::MOVI:
13652   case AArch64ISD::MOVIshift:
13653   case AArch64ISD::MOVIedit:
13654   case AArch64ISD::MOVImsl:
13655   case AArch64ISD::MVNIshift:
13656   case AArch64ISD::MVNImsl:
13657     break;
13658   default:
13659     // FMOV could be supported, but isn't very useful, as it would only occur
13660     // if you passed a bitcast'd floating point immediate to an eligible long
13661     // integer op (addl, smull, ...).
13662     return SDValue();
13663   }
13664
13665   MVT NarrowTy = N.getSimpleValueType();
13666   if (!NarrowTy.is64BitVector())
13667     return SDValue();
13668
13669   MVT ElementTy = NarrowTy.getVectorElementType();
13670   unsigned NumElems = NarrowTy.getVectorNumElements();
13671   MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
13672
13673   SDLoc dl(N);
13674   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
13675                      DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
13676                      DAG.getConstant(NumElems, dl, MVT::i64));
13677 }
13678
13679 static bool isEssentiallyExtractHighSubvector(SDValue N) {
13680   if (N.getOpcode() == ISD::BITCAST)
13681     N = N.getOperand(0);
13682   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
13683     return false;
13684   if (N.getOperand(0).getValueType().isScalableVector())
13685     return false;
13686   return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
13687          N.getOperand(0).getValueType().getVectorNumElements() / 2;
13688 }
13689
13690 /// Helper structure to keep track of ISD::SET_CC operands.
13691 struct GenericSetCCInfo {
13692   const SDValue *Opnd0;
13693   const SDValue *Opnd1;
13694   ISD::CondCode CC;
13695 };
13696
13697 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
13698 struct AArch64SetCCInfo {
13699   const SDValue *Cmp;
13700   AArch64CC::CondCode CC;
13701 };
13702
13703 /// Helper structure to keep track of SetCC information.
13704 union SetCCInfo {
13705   GenericSetCCInfo Generic;
13706   AArch64SetCCInfo AArch64;
13707 };
13708
13709 /// Helper structure to be able to read SetCC information. If the IsAArch64
13710 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
13711 /// GenericSetCCInfo.
13712 struct SetCCInfoAndKind {
13713   SetCCInfo Info;
13714   bool IsAArch64;
13715 };
13716
13717 /// Check whether or not \p Op is a SET_CC operation, either a generic or
13718 /// an
13719 /// AArch64 lowered one.
13720 /// \p SetCCInfo is filled accordingly.
13721 /// \post SetCCInfo is meaningful only when this function returns true.
13722 /// \return True when Op is a kind of SET_CC operation.
13723 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
13724   // If this is a setcc, this is straightforward.
13725   if (Op.getOpcode() == ISD::SETCC) {
13726     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13727     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13728     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13729     SetCCInfo.IsAArch64 = false;
13730     return true;
13731   }
13732   // Otherwise, check if this is a matching csel instruction.
13733   // In other words:
13734   // - csel 1, 0, cc
13735   // - csel 0, 1, !cc
13736   if (Op.getOpcode() != AArch64ISD::CSEL)
13737     return false;
13738   // Set the information about the operands.
13739   // TODO: we want the operands of the Cmp not the csel
13740   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13741   SetCCInfo.IsAArch64 = true;
13742   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13743       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13744
13745   // Check that the operands match the constraints:
13746   // (1) Both operands must be constants.
13747   // (2) One must be 1 and the other must be 0.
13748   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13749   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13750
13751   // Check (1).
13752   if (!TValue || !FValue)
13753     return false;
13754
13755   // Check (2).
13756   if (!TValue->isOne()) {
13757     // Update the comparison when we are interested in !cc.
13758     std::swap(TValue, FValue);
13759     SetCCInfo.Info.AArch64.CC =
13760         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
13761   }
13762   return TValue->isOne() && FValue->isNullValue();
13763 }
13764
13765 // Returns true if Op is setcc or zext of setcc.
13766 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13767   if (isSetCC(Op, Info))
13768     return true;
13769   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13770           isSetCC(Op->getOperand(0), Info));
13771 }
13772
13773 // The folding we want to perform is:
13774 // (add x, [zext] (setcc cc ...) )
13775 // -->
13776 // (csel x, (add x, 1), !cc ...)
13777 //
13778 // The latter will get matched to a CSINC instruction.
13779 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
13780   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13781   SDValue LHS = Op->getOperand(0);
13782   SDValue RHS = Op->getOperand(1);
13783   SetCCInfoAndKind InfoAndKind;
13784
13785   // If both operands are a SET_CC, then we don't want to perform this
13786   // folding and create another csel as this results in more instructions
13787   // (and higher register usage).
13788   if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
13789       isSetCCOrZExtSetCC(RHS, InfoAndKind))
13790     return SDValue();
13791
13792   // If neither operand is a SET_CC, give up.
13793   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13794     std::swap(LHS, RHS);
13795     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
13796       return SDValue();
13797   }
13798
13799   // FIXME: This could be generalized to work for FP comparisons.
13800   EVT CmpVT = InfoAndKind.IsAArch64
13801                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13802                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
13803   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13804     return SDValue();
13805
13806   SDValue CCVal;
13807   SDValue Cmp;
13808   SDLoc dl(Op);
13809   if (InfoAndKind.IsAArch64) {
13810     CCVal = DAG.getConstant(
13811         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13812         MVT::i32);
13813     Cmp = *InfoAndKind.Info.AArch64.Cmp;
13814   } else
13815     Cmp = getAArch64Cmp(
13816         *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13817         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13818         dl);
13819
13820   EVT VT = Op->getValueType(0);
13821   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13822   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13823 }
13824
13825 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
13826 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13827   EVT VT = N->getValueType(0);
13828   // Only scalar integer and vector types.
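  // For example (editorial sketch, not from the original source), adding
  //   (i32 (extract_vector_elt (v4i32 (AArch64ISD::UADDV A)), 0))
  // to
  //   (i32 (extract_vector_elt (v4i32 (AArch64ISD::UADDV B)), 0))
  // becomes
  //   (i32 (extract_vector_elt (v4i32 (AArch64ISD::UADDV (add A, B))), 0))
  // i.e. two "addv" reductions are traded for one vector add plus one addv.
  // The checks below make sure the node really has this shape.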
13829 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) 13830 return SDValue(); 13831 13832 SDValue LHS = N->getOperand(0); 13833 SDValue RHS = N->getOperand(1); 13834 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 13835 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) 13836 return SDValue(); 13837 13838 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 13839 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1)); 13840 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue()) 13841 return SDValue(); 13842 13843 SDValue Op1 = LHS->getOperand(0); 13844 SDValue Op2 = RHS->getOperand(0); 13845 EVT OpVT1 = Op1.getValueType(); 13846 EVT OpVT2 = Op2.getValueType(); 13847 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 || 13848 Op2.getOpcode() != AArch64ISD::UADDV || 13849 OpVT1.getVectorElementType() != VT) 13850 return SDValue(); 13851 13852 SDValue Val1 = Op1.getOperand(0); 13853 SDValue Val2 = Op2.getOperand(0); 13854 EVT ValVT = Val1->getValueType(0); 13855 SDLoc DL(N); 13856 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2); 13857 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, 13858 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal), 13859 DAG.getConstant(0, DL, MVT::i64)); 13860 } 13861 13862 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y) 13863 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { 13864 EVT VT = N->getValueType(0); 13865 if (N->getOpcode() != ISD::ADD) 13866 return SDValue(); 13867 13868 SDValue Dot = N->getOperand(0); 13869 SDValue A = N->getOperand(1); 13870 // Handle commutivity 13871 auto isZeroDot = [](SDValue Dot) { 13872 return (Dot.getOpcode() == AArch64ISD::UDOT || 13873 Dot.getOpcode() == AArch64ISD::SDOT) && 13874 isZerosVector(Dot.getOperand(0).getNode()); 13875 }; 13876 if (!isZeroDot(Dot)) 13877 std::swap(Dot, A); 13878 if (!isZeroDot(Dot)) 13879 return SDValue(); 13880 13881 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1), 13882 Dot.getOperand(2)); 13883 } 13884 13885 // The basic add/sub long vector instructions have variants with "2" on the end 13886 // which act on the high-half of their inputs. They are normally matched by 13887 // patterns like: 13888 // 13889 // (add (zeroext (extract_high LHS)), 13890 // (zeroext (extract_high RHS))) 13891 // -> uaddl2 vD, vN, vM 13892 // 13893 // However, if one of the extracts is something like a duplicate, this 13894 // instruction can still be used profitably. This function puts the DAG into a 13895 // more appropriate form for those patterns to trigger. 13896 static SDValue performAddSubLongCombine(SDNode *N, 13897 TargetLowering::DAGCombinerInfo &DCI, 13898 SelectionDAG &DAG) { 13899 if (DCI.isBeforeLegalizeOps()) 13900 return SDValue(); 13901 13902 MVT VT = N->getSimpleValueType(0); 13903 if (!VT.is128BitVector()) { 13904 if (N->getOpcode() == ISD::ADD) 13905 return performSetccAddFolding(N, DAG); 13906 return SDValue(); 13907 } 13908 13909 // Make sure both branches are extended in the same way. 13910 SDValue LHS = N->getOperand(0); 13911 SDValue RHS = N->getOperand(1); 13912 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 13913 LHS.getOpcode() != ISD::SIGN_EXTEND) || 13914 LHS.getOpcode() != RHS.getOpcode()) 13915 return SDValue(); 13916 13917 unsigned ExtType = LHS.getOpcode(); 13918 13919 // It's not worth doing if at least one of the inputs isn't already an 13920 // extract, but we don't know which it'll be so we have to try both. 
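  // Editorial illustration (sketch, not from the original source): with
  //   LHS = (zext (extract_subvector (v16i8 X), (i64 8)))   // extract_high
  //   RHS = (zext (v8i8 (AArch64ISD::DUP Wn)))
  // the DUP on the RHS is widened and wrapped in its own high-half extract,
  // after which the whole add matches "uaddl2 v0.8h, v1.16b, v2.16b" instead
  // of a plain uaddl plus extra data movement.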
13921 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { 13922 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 13923 if (!RHS.getNode()) 13924 return SDValue(); 13925 13926 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 13927 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { 13928 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 13929 if (!LHS.getNode()) 13930 return SDValue(); 13931 13932 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 13933 } 13934 13935 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 13936 } 13937 13938 static SDValue performAddSubCombine(SDNode *N, 13939 TargetLowering::DAGCombinerInfo &DCI, 13940 SelectionDAG &DAG) { 13941 // Try to change sum of two reductions. 13942 if (SDValue Val = performUADDVCombine(N, DAG)) 13943 return Val; 13944 if (SDValue Val = performAddDotCombine(N, DAG)) 13945 return Val; 13946 13947 return performAddSubLongCombine(N, DCI, DAG); 13948 } 13949 13950 // Massage DAGs which we can use the high-half "long" operations on into 13951 // something isel will recognize better. E.g. 13952 // 13953 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 13954 // (aarch64_neon_umull (extract_high (v2i64 vec))) 13955 // (extract_high (v2i64 (dup128 scalar))))) 13956 // 13957 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 13958 TargetLowering::DAGCombinerInfo &DCI, 13959 SelectionDAG &DAG) { 13960 if (DCI.isBeforeLegalizeOps()) 13961 return SDValue(); 13962 13963 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1); 13964 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2); 13965 assert(LHS.getValueType().is64BitVector() && 13966 RHS.getValueType().is64BitVector() && 13967 "unexpected shape for long operation"); 13968 13969 // Either node could be a DUP, but it's not worth doing both of them (you'd 13970 // just as well use the non-high version) so look for a corresponding extract 13971 // operation on the other "wing". 
13972 if (isEssentiallyExtractHighSubvector(LHS)) { 13973 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 13974 if (!RHS.getNode()) 13975 return SDValue(); 13976 } else if (isEssentiallyExtractHighSubvector(RHS)) { 13977 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 13978 if (!LHS.getNode()) 13979 return SDValue(); 13980 } 13981 13982 if (IID == Intrinsic::not_intrinsic) 13983 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS); 13984 13985 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 13986 N->getOperand(0), LHS, RHS); 13987 } 13988 13989 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 13990 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 13991 unsigned ElemBits = ElemTy.getSizeInBits(); 13992 13993 int64_t ShiftAmount; 13994 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 13995 APInt SplatValue, SplatUndef; 13996 unsigned SplatBitSize; 13997 bool HasAnyUndefs; 13998 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 13999 HasAnyUndefs, ElemBits) || 14000 SplatBitSize != ElemBits) 14001 return SDValue(); 14002 14003 ShiftAmount = SplatValue.getSExtValue(); 14004 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 14005 ShiftAmount = CVN->getSExtValue(); 14006 } else 14007 return SDValue(); 14008 14009 unsigned Opcode; 14010 bool IsRightShift; 14011 switch (IID) { 14012 default: 14013 llvm_unreachable("Unknown shift intrinsic"); 14014 case Intrinsic::aarch64_neon_sqshl: 14015 Opcode = AArch64ISD::SQSHL_I; 14016 IsRightShift = false; 14017 break; 14018 case Intrinsic::aarch64_neon_uqshl: 14019 Opcode = AArch64ISD::UQSHL_I; 14020 IsRightShift = false; 14021 break; 14022 case Intrinsic::aarch64_neon_srshl: 14023 Opcode = AArch64ISD::SRSHR_I; 14024 IsRightShift = true; 14025 break; 14026 case Intrinsic::aarch64_neon_urshl: 14027 Opcode = AArch64ISD::URSHR_I; 14028 IsRightShift = true; 14029 break; 14030 case Intrinsic::aarch64_neon_sqshlu: 14031 Opcode = AArch64ISD::SQSHLU_I; 14032 IsRightShift = false; 14033 break; 14034 case Intrinsic::aarch64_neon_sshl: 14035 case Intrinsic::aarch64_neon_ushl: 14036 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular 14037 // left shift for positive shift amounts. Below, we only replace the current 14038 // node with VSHL, if this condition is met. 14039 Opcode = AArch64ISD::VSHL; 14040 IsRightShift = false; 14041 break; 14042 } 14043 14044 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 14045 SDLoc dl(N); 14046 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 14047 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 14048 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 14049 SDLoc dl(N); 14050 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 14051 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 14052 } 14053 14054 return SDValue(); 14055 } 14056 14057 // The CRC32[BH] instructions ignore the high bits of their data operand. Since 14058 // the intrinsics must be legal and take an i32, this means there's almost 14059 // certainly going to be a zext in the DAG which we can eliminate. 
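// For instance (editorial sketch, not from the original source), IR such as
//   %masked = and i32 %val, 255
//   %crc    = call i32 @llvm.aarch64.crc32b(i32 %acc, i32 %masked)
// reaches this point as crc32b of (and %val, 0xff); since crc32b only reads
// the low byte of its data operand anyway, the helper below drops the AND
// and feeds %val to the intrinsic directly.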
14060 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 14061 SDValue AndN = N->getOperand(2); 14062 if (AndN.getOpcode() != ISD::AND) 14063 return SDValue(); 14064 14065 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 14066 if (!CMask || CMask->getZExtValue() != Mask) 14067 return SDValue(); 14068 14069 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 14070 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 14071 } 14072 14073 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 14074 SelectionDAG &DAG) { 14075 SDLoc dl(N); 14076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 14077 DAG.getNode(Opc, dl, 14078 N->getOperand(1).getSimpleValueType(), 14079 N->getOperand(1)), 14080 DAG.getConstant(0, dl, MVT::i64)); 14081 } 14082 14083 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { 14084 SDLoc DL(N); 14085 SDValue Op1 = N->getOperand(1); 14086 SDValue Op2 = N->getOperand(2); 14087 EVT ScalarTy = Op2.getValueType(); 14088 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 14089 ScalarTy = MVT::i32; 14090 14091 // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base). 14092 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0)); 14093 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2); 14094 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step); 14095 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1); 14096 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base); 14097 } 14098 14099 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { 14100 SDLoc dl(N); 14101 SDValue Scalar = N->getOperand(3); 14102 EVT ScalarTy = Scalar.getValueType(); 14103 14104 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 14105 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 14106 14107 SDValue Passthru = N->getOperand(1); 14108 SDValue Pred = N->getOperand(2); 14109 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), 14110 Pred, Scalar, Passthru); 14111 } 14112 14113 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { 14114 SDLoc dl(N); 14115 LLVMContext &Ctx = *DAG.getContext(); 14116 EVT VT = N->getValueType(0); 14117 14118 assert(VT.isScalableVector() && "Expected a scalable vector."); 14119 14120 // Current lowering only supports the SVE-ACLE types. 14121 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) 14122 return SDValue(); 14123 14124 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; 14125 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8; 14126 EVT ByteVT = 14127 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize)); 14128 14129 // Convert everything to the domain of EXT (i.e bytes). 
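  // Editorial example (not from the original source): for an nxv4i32 input an
  // element index of 3 becomes the byte index 3 * 4 = 12, so the node built
  // below can be selected as "ext z0.b, z0.b, z1.b, #12", which moves exactly
  // the lanes the element-granular svext asked for.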
14130 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); 14131 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2)); 14132 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3), 14133 DAG.getConstant(ElemSize, dl, MVT::i32)); 14134 14135 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2); 14136 return DAG.getNode(ISD::BITCAST, dl, VT, EXT); 14137 } 14138 14139 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 14140 TargetLowering::DAGCombinerInfo &DCI, 14141 SelectionDAG &DAG) { 14142 if (DCI.isBeforeLegalize()) 14143 return SDValue(); 14144 14145 SDValue Comparator = N->getOperand(3); 14146 if (Comparator.getOpcode() == AArch64ISD::DUP || 14147 Comparator.getOpcode() == ISD::SPLAT_VECTOR) { 14148 unsigned IID = getIntrinsicID(N); 14149 EVT VT = N->getValueType(0); 14150 EVT CmpVT = N->getOperand(2).getValueType(); 14151 SDValue Pred = N->getOperand(1); 14152 SDValue Imm; 14153 SDLoc DL(N); 14154 14155 switch (IID) { 14156 default: 14157 llvm_unreachable("Called with wrong intrinsic!"); 14158 break; 14159 14160 // Signed comparisons 14161 case Intrinsic::aarch64_sve_cmpeq_wide: 14162 case Intrinsic::aarch64_sve_cmpne_wide: 14163 case Intrinsic::aarch64_sve_cmpge_wide: 14164 case Intrinsic::aarch64_sve_cmpgt_wide: 14165 case Intrinsic::aarch64_sve_cmplt_wide: 14166 case Intrinsic::aarch64_sve_cmple_wide: { 14167 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 14168 int64_t ImmVal = CN->getSExtValue(); 14169 if (ImmVal >= -16 && ImmVal <= 15) 14170 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 14171 else 14172 return SDValue(); 14173 } 14174 break; 14175 } 14176 // Unsigned comparisons 14177 case Intrinsic::aarch64_sve_cmphs_wide: 14178 case Intrinsic::aarch64_sve_cmphi_wide: 14179 case Intrinsic::aarch64_sve_cmplo_wide: 14180 case Intrinsic::aarch64_sve_cmpls_wide: { 14181 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 14182 uint64_t ImmVal = CN->getZExtValue(); 14183 if (ImmVal <= 127) 14184 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 14185 else 14186 return SDValue(); 14187 } 14188 break; 14189 } 14190 } 14191 14192 if (!Imm) 14193 return SDValue(); 14194 14195 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); 14196 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, 14197 N->getOperand(2), Splat, DAG.getCondCode(CC)); 14198 } 14199 14200 return SDValue(); 14201 } 14202 14203 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, 14204 AArch64CC::CondCode Cond) { 14205 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14206 14207 SDLoc DL(Op); 14208 assert(Op.getValueType().isScalableVector() && 14209 TLI.isTypeLegal(Op.getValueType()) && 14210 "Expected legal scalable vector type!"); 14211 14212 // Ensure target specific opcodes are using legal type. 14213 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); 14214 SDValue TVal = DAG.getConstant(1, DL, OutVT); 14215 SDValue FVal = DAG.getConstant(0, DL, OutVT); 14216 14217 // Set condition code (CC) flags. 14218 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); 14219 14220 // Convert CC to integer based on requested condition. 14221 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 
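  // Editorial sketch (not from the original source): for
  // @llvm.aarch64.sve.ptest.any on legal nxv16i1 operands the expectation is
  // roughly
  //   ptest p0, p1.b
  //   cset  w0, ne
  // i.e. PTEST sets NZCV and the CSEL built below folds into a cset/csinc
  // keyed off those flags.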
14222 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); 14223 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); 14224 return DAG.getZExtOrTrunc(Res, DL, VT); 14225 } 14226 14227 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, 14228 SelectionDAG &DAG) { 14229 SDLoc DL(N); 14230 14231 SDValue Pred = N->getOperand(1); 14232 SDValue VecToReduce = N->getOperand(2); 14233 14234 // NOTE: The integer reduction's result type is not always linked to the 14235 // operand's element type so we construct it from the intrinsic's result type. 14236 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0)); 14237 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); 14238 14239 // SVE reductions set the whole vector register with the first element 14240 // containing the reduction result, which we'll now extract. 14241 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 14242 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 14243 Zero); 14244 } 14245 14246 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, 14247 SelectionDAG &DAG) { 14248 SDLoc DL(N); 14249 14250 SDValue Pred = N->getOperand(1); 14251 SDValue VecToReduce = N->getOperand(2); 14252 14253 EVT ReduceVT = VecToReduce.getValueType(); 14254 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); 14255 14256 // SVE reductions set the whole vector register with the first element 14257 // containing the reduction result, which we'll now extract. 14258 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 14259 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 14260 Zero); 14261 } 14262 14263 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, 14264 SelectionDAG &DAG) { 14265 SDLoc DL(N); 14266 14267 SDValue Pred = N->getOperand(1); 14268 SDValue InitVal = N->getOperand(2); 14269 SDValue VecToReduce = N->getOperand(3); 14270 EVT ReduceVT = VecToReduce.getValueType(); 14271 14272 // Ordered reductions use the first lane of the result vector as the 14273 // reduction's initial value. 14274 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 14275 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, 14276 DAG.getUNDEF(ReduceVT), InitVal, Zero); 14277 14278 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); 14279 14280 // SVE reductions set the whole vector register with the first element 14281 // containing the reduction result, which we'll now extract. 14282 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 14283 Zero); 14284 } 14285 14286 static bool isAllActivePredicate(SDValue N) { 14287 unsigned NumElts = N.getValueType().getVectorMinNumElements(); 14288 14289 // Look through cast. 14290 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) { 14291 N = N.getOperand(0); 14292 // When reinterpreting from a type with fewer elements the "new" elements 14293 // are not active, so bail if they're likely to be used. 14294 if (N.getValueType().getVectorMinNumElements() < NumElts) 14295 return false; 14296 } 14297 14298 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size 14299 // or smaller than the implicit element type represented by N. 14300 // NOTE: A larger element count implies a smaller element type. 
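  // For example (editorial sketch, not from the original source), an nxv4i1
  // value built as
  //   (nxv4i1 (reinterpret_cast (nxv16i1 (AArch64ISD::PTRUE all))))
  // is accepted by the check below: the ptrue provides at least 16 active
  // elements, which covers the 4 this use cares about.  Casting a narrower
  // ptrue up to more elements is rejected by the loop above instead.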
14301 if (N.getOpcode() == AArch64ISD::PTRUE && 14302 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all) 14303 return N.getValueType().getVectorMinNumElements() >= NumElts; 14304 14305 return false; 14306 } 14307 14308 // If a merged operation has no inactive lanes we can relax it to a predicated 14309 // or unpredicated operation, which potentially allows better isel (perhaps 14310 // using immediate forms) or relaxing register reuse requirements. 14311 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, 14312 SelectionDAG &DAG, 14313 bool UnpredOp = false) { 14314 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); 14315 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); 14316 SDValue Pg = N->getOperand(1); 14317 14318 // ISD way to specify an all active predicate. 14319 if (isAllActivePredicate(Pg)) { 14320 if (UnpredOp) 14321 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2), 14322 N->getOperand(3)); 14323 else 14324 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, 14325 N->getOperand(2), N->getOperand(3)); 14326 } 14327 14328 // FUTURE: SplatVector(true) 14329 return SDValue(); 14330 } 14331 14332 static SDValue performIntrinsicCombine(SDNode *N, 14333 TargetLowering::DAGCombinerInfo &DCI, 14334 const AArch64Subtarget *Subtarget) { 14335 SelectionDAG &DAG = DCI.DAG; 14336 unsigned IID = getIntrinsicID(N); 14337 switch (IID) { 14338 default: 14339 break; 14340 case Intrinsic::aarch64_neon_vcvtfxs2fp: 14341 case Intrinsic::aarch64_neon_vcvtfxu2fp: 14342 return tryCombineFixedPointConvert(N, DCI, DAG); 14343 case Intrinsic::aarch64_neon_saddv: 14344 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 14345 case Intrinsic::aarch64_neon_uaddv: 14346 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 14347 case Intrinsic::aarch64_neon_sminv: 14348 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 14349 case Intrinsic::aarch64_neon_uminv: 14350 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 14351 case Intrinsic::aarch64_neon_smaxv: 14352 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 14353 case Intrinsic::aarch64_neon_umaxv: 14354 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 14355 case Intrinsic::aarch64_neon_fmax: 14356 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), 14357 N->getOperand(1), N->getOperand(2)); 14358 case Intrinsic::aarch64_neon_fmin: 14359 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), 14360 N->getOperand(1), N->getOperand(2)); 14361 case Intrinsic::aarch64_neon_fmaxnm: 14362 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 14363 N->getOperand(1), N->getOperand(2)); 14364 case Intrinsic::aarch64_neon_fminnm: 14365 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 14366 N->getOperand(1), N->getOperand(2)); 14367 case Intrinsic::aarch64_neon_smull: 14368 case Intrinsic::aarch64_neon_umull: 14369 case Intrinsic::aarch64_neon_pmull: 14370 case Intrinsic::aarch64_neon_sqdmull: 14371 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 14372 case Intrinsic::aarch64_neon_sqshl: 14373 case Intrinsic::aarch64_neon_uqshl: 14374 case Intrinsic::aarch64_neon_sqshlu: 14375 case Intrinsic::aarch64_neon_srshl: 14376 case Intrinsic::aarch64_neon_urshl: 14377 case Intrinsic::aarch64_neon_sshl: 14378 case Intrinsic::aarch64_neon_ushl: 14379 return tryCombineShiftImm(IID, N, DAG); 14380 case Intrinsic::aarch64_crc32b: 14381 case Intrinsic::aarch64_crc32cb: 14382 
return tryCombineCRC32(0xff, N, DAG); 14383 case Intrinsic::aarch64_crc32h: 14384 case Intrinsic::aarch64_crc32ch: 14385 return tryCombineCRC32(0xffff, N, DAG); 14386 case Intrinsic::aarch64_sve_saddv: 14387 // There is no i64 version of SADDV because the sign is irrelevant. 14388 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) 14389 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 14390 else 14391 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); 14392 case Intrinsic::aarch64_sve_uaddv: 14393 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 14394 case Intrinsic::aarch64_sve_smaxv: 14395 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); 14396 case Intrinsic::aarch64_sve_umaxv: 14397 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); 14398 case Intrinsic::aarch64_sve_sminv: 14399 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); 14400 case Intrinsic::aarch64_sve_uminv: 14401 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); 14402 case Intrinsic::aarch64_sve_orv: 14403 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); 14404 case Intrinsic::aarch64_sve_eorv: 14405 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); 14406 case Intrinsic::aarch64_sve_andv: 14407 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); 14408 case Intrinsic::aarch64_sve_index: 14409 return LowerSVEIntrinsicIndex(N, DAG); 14410 case Intrinsic::aarch64_sve_dup: 14411 return LowerSVEIntrinsicDUP(N, DAG); 14412 case Intrinsic::aarch64_sve_dup_x: 14413 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), 14414 N->getOperand(1)); 14415 case Intrinsic::aarch64_sve_ext: 14416 return LowerSVEIntrinsicEXT(N, DAG); 14417 case Intrinsic::aarch64_sve_mul: 14418 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG); 14419 case Intrinsic::aarch64_sve_smulh: 14420 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG); 14421 case Intrinsic::aarch64_sve_umulh: 14422 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG); 14423 case Intrinsic::aarch64_sve_smin: 14424 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); 14425 case Intrinsic::aarch64_sve_umin: 14426 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG); 14427 case Intrinsic::aarch64_sve_smax: 14428 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG); 14429 case Intrinsic::aarch64_sve_umax: 14430 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG); 14431 case Intrinsic::aarch64_sve_lsl: 14432 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG); 14433 case Intrinsic::aarch64_sve_lsr: 14434 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); 14435 case Intrinsic::aarch64_sve_asr: 14436 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); 14437 case Intrinsic::aarch64_sve_fadd: 14438 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG); 14439 case Intrinsic::aarch64_sve_fsub: 14440 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG); 14441 case Intrinsic::aarch64_sve_fmul: 14442 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG); 14443 case Intrinsic::aarch64_sve_add: 14444 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true); 14445 case Intrinsic::aarch64_sve_sub: 14446 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true); 14447 case Intrinsic::aarch64_sve_and: 14448 return convertMergedOpToPredOp(N, ISD::AND, DAG, true); 14449 case Intrinsic::aarch64_sve_bic: 14450 return 
convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true); 14451 case Intrinsic::aarch64_sve_eor: 14452 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true); 14453 case Intrinsic::aarch64_sve_orr: 14454 return convertMergedOpToPredOp(N, ISD::OR, DAG, true); 14455 case Intrinsic::aarch64_sve_sqadd: 14456 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); 14457 case Intrinsic::aarch64_sve_sqsub: 14458 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true); 14459 case Intrinsic::aarch64_sve_uqadd: 14460 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true); 14461 case Intrinsic::aarch64_sve_uqsub: 14462 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true); 14463 case Intrinsic::aarch64_sve_sqadd_x: 14464 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0), 14465 N->getOperand(1), N->getOperand(2)); 14466 case Intrinsic::aarch64_sve_sqsub_x: 14467 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), 14468 N->getOperand(1), N->getOperand(2)); 14469 case Intrinsic::aarch64_sve_uqadd_x: 14470 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0), 14471 N->getOperand(1), N->getOperand(2)); 14472 case Intrinsic::aarch64_sve_uqsub_x: 14473 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), 14474 N->getOperand(1), N->getOperand(2)); 14475 case Intrinsic::aarch64_sve_cmphs: 14476 if (!N->getOperand(2).getValueType().isFloatingPoint()) 14477 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14478 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14479 N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); 14480 break; 14481 case Intrinsic::aarch64_sve_cmphi: 14482 if (!N->getOperand(2).getValueType().isFloatingPoint()) 14483 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14484 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14485 N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); 14486 break; 14487 case Intrinsic::aarch64_sve_fcmpge: 14488 case Intrinsic::aarch64_sve_cmpge: 14489 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14490 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14491 N->getOperand(3), DAG.getCondCode(ISD::SETGE)); 14492 break; 14493 case Intrinsic::aarch64_sve_fcmpgt: 14494 case Intrinsic::aarch64_sve_cmpgt: 14495 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14496 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14497 N->getOperand(3), DAG.getCondCode(ISD::SETGT)); 14498 break; 14499 case Intrinsic::aarch64_sve_fcmpeq: 14500 case Intrinsic::aarch64_sve_cmpeq: 14501 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14502 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14503 N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); 14504 break; 14505 case Intrinsic::aarch64_sve_fcmpne: 14506 case Intrinsic::aarch64_sve_cmpne: 14507 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14508 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14509 N->getOperand(3), DAG.getCondCode(ISD::SETNE)); 14510 break; 14511 case Intrinsic::aarch64_sve_fcmpuo: 14512 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 14513 N->getValueType(0), N->getOperand(1), N->getOperand(2), 14514 N->getOperand(3), DAG.getCondCode(ISD::SETUO)); 14515 break; 14516 case Intrinsic::aarch64_sve_fadda: 14517 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); 14518 case Intrinsic::aarch64_sve_faddv: 14519 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); 14520 case Intrinsic::aarch64_sve_fmaxnmv: 14521 return combineSVEReductionFP(N, 
AArch64ISD::FMAXNMV_PRED, DAG); 14522 case Intrinsic::aarch64_sve_fmaxv: 14523 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); 14524 case Intrinsic::aarch64_sve_fminnmv: 14525 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); 14526 case Intrinsic::aarch64_sve_fminv: 14527 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); 14528 case Intrinsic::aarch64_sve_sel: 14529 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), 14530 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 14531 case Intrinsic::aarch64_sve_cmpeq_wide: 14532 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); 14533 case Intrinsic::aarch64_sve_cmpne_wide: 14534 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); 14535 case Intrinsic::aarch64_sve_cmpge_wide: 14536 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); 14537 case Intrinsic::aarch64_sve_cmpgt_wide: 14538 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); 14539 case Intrinsic::aarch64_sve_cmplt_wide: 14540 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); 14541 case Intrinsic::aarch64_sve_cmple_wide: 14542 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); 14543 case Intrinsic::aarch64_sve_cmphs_wide: 14544 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); 14545 case Intrinsic::aarch64_sve_cmphi_wide: 14546 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); 14547 case Intrinsic::aarch64_sve_cmplo_wide: 14548 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); 14549 case Intrinsic::aarch64_sve_cmpls_wide: 14550 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); 14551 case Intrinsic::aarch64_sve_ptest_any: 14552 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 14553 AArch64CC::ANY_ACTIVE); 14554 case Intrinsic::aarch64_sve_ptest_first: 14555 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 14556 AArch64CC::FIRST_ACTIVE); 14557 case Intrinsic::aarch64_sve_ptest_last: 14558 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 14559 AArch64CC::LAST_ACTIVE); 14560 } 14561 return SDValue(); 14562 } 14563 14564 static SDValue performExtendCombine(SDNode *N, 14565 TargetLowering::DAGCombinerInfo &DCI, 14566 SelectionDAG &DAG) { 14567 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 14568 // we can convert that DUP into another extract_high (of a bigger DUP), which 14569 // helps the backend to decide that an sabdl2 would be useful, saving a real 14570 // extract_high operation. 14571 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 14572 (N->getOperand(0).getOpcode() == ISD::ABDU || 14573 N->getOperand(0).getOpcode() == ISD::ABDS)) { 14574 SDNode *ABDNode = N->getOperand(0).getNode(); 14575 SDValue NewABD = 14576 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); 14577 if (!NewABD.getNode()) 14578 return SDValue(); 14579 14580 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); 14581 } 14582 return SDValue(); 14583 } 14584 14585 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, 14586 SDValue SplatVal, unsigned NumVecElts) { 14587 assert(!St.isTruncatingStore() && "cannot split truncating vector store"); 14588 unsigned OrigAlignment = St.getAlignment(); 14589 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; 14590 14591 // Create scalar stores. 
This is at least as good as the code sequence for a 14592 // split unaligned store which is a dup.s, ext.b, and two stores. 14593 // Most of the time the three stores should be replaced by store pair 14594 // instructions (stp). 14595 SDLoc DL(&St); 14596 SDValue BasePtr = St.getBasePtr(); 14597 uint64_t BaseOffset = 0; 14598 14599 const MachinePointerInfo &PtrInfo = St.getPointerInfo(); 14600 SDValue NewST1 = 14601 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, 14602 OrigAlignment, St.getMemOperand()->getFlags()); 14603 14604 // As this in ISel, we will not merge this add which may degrade results. 14605 if (BasePtr->getOpcode() == ISD::ADD && 14606 isa<ConstantSDNode>(BasePtr->getOperand(1))) { 14607 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); 14608 BasePtr = BasePtr->getOperand(0); 14609 } 14610 14611 unsigned Offset = EltOffset; 14612 while (--NumVecElts) { 14613 unsigned Alignment = MinAlign(OrigAlignment, Offset); 14614 SDValue OffsetPtr = 14615 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 14616 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); 14617 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 14618 PtrInfo.getWithOffset(Offset), Alignment, 14619 St.getMemOperand()->getFlags()); 14620 Offset += EltOffset; 14621 } 14622 return NewST1; 14623 } 14624 14625 // Returns an SVE type that ContentTy can be trivially sign or zero extended 14626 // into. 14627 static MVT getSVEContainerType(EVT ContentTy) { 14628 assert(ContentTy.isSimple() && "No SVE containers for extended types"); 14629 14630 switch (ContentTy.getSimpleVT().SimpleTy) { 14631 default: 14632 llvm_unreachable("No known SVE container for this MVT type"); 14633 case MVT::nxv2i8: 14634 case MVT::nxv2i16: 14635 case MVT::nxv2i32: 14636 case MVT::nxv2i64: 14637 case MVT::nxv2f32: 14638 case MVT::nxv2f64: 14639 return MVT::nxv2i64; 14640 case MVT::nxv4i8: 14641 case MVT::nxv4i16: 14642 case MVT::nxv4i32: 14643 case MVT::nxv4f32: 14644 return MVT::nxv4i32; 14645 case MVT::nxv8i8: 14646 case MVT::nxv8i16: 14647 case MVT::nxv8f16: 14648 case MVT::nxv8bf16: 14649 return MVT::nxv8i16; 14650 case MVT::nxv16i8: 14651 return MVT::nxv16i8; 14652 } 14653 } 14654 14655 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { 14656 SDLoc DL(N); 14657 EVT VT = N->getValueType(0); 14658 14659 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) 14660 return SDValue(); 14661 14662 EVT ContainerVT = VT; 14663 if (ContainerVT.isInteger()) 14664 ContainerVT = getSVEContainerType(ContainerVT); 14665 14666 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); 14667 SDValue Ops[] = { N->getOperand(0), // Chain 14668 N->getOperand(2), // Pg 14669 N->getOperand(3), // Base 14670 DAG.getValueType(VT) }; 14671 14672 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); 14673 SDValue LoadChain = SDValue(Load.getNode(), 1); 14674 14675 if (ContainerVT.isInteger() && (VT != ContainerVT)) 14676 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); 14677 14678 return DAG.getMergeValues({ Load, LoadChain }, DL); 14679 } 14680 14681 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { 14682 SDLoc DL(N); 14683 EVT VT = N->getValueType(0); 14684 EVT PtrTy = N->getOperand(3).getValueType(); 14685 14686 if (VT == MVT::nxv8bf16 && 14687 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 14688 return SDValue(); 14689 14690 EVT LoadVT = VT; 14691 if (VT.isFloatingPoint()) 14692 LoadVT = VT.changeTypeToInteger(); 14693 14694 auto 
*MINode = cast<MemIntrinsicSDNode>(N); 14695 SDValue PassThru = DAG.getConstant(0, DL, LoadVT); 14696 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), 14697 MINode->getOperand(3), DAG.getUNDEF(PtrTy), 14698 MINode->getOperand(2), PassThru, 14699 MINode->getMemoryVT(), MINode->getMemOperand(), 14700 ISD::UNINDEXED, ISD::NON_EXTLOAD, false); 14701 14702 if (VT.isFloatingPoint()) { 14703 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) }; 14704 return DAG.getMergeValues(Ops, DL); 14705 } 14706 14707 return L; 14708 } 14709 14710 template <unsigned Opcode> 14711 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { 14712 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || 14713 Opcode == AArch64ISD::LD1RO_MERGE_ZERO, 14714 "Unsupported opcode."); 14715 SDLoc DL(N); 14716 EVT VT = N->getValueType(0); 14717 if (VT == MVT::nxv8bf16 && 14718 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 14719 return SDValue(); 14720 14721 EVT LoadVT = VT; 14722 if (VT.isFloatingPoint()) 14723 LoadVT = VT.changeTypeToInteger(); 14724 14725 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; 14726 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); 14727 SDValue LoadChain = SDValue(Load.getNode(), 1); 14728 14729 if (VT.isFloatingPoint()) 14730 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); 14731 14732 return DAG.getMergeValues({Load, LoadChain}, DL); 14733 } 14734 14735 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { 14736 SDLoc DL(N); 14737 SDValue Data = N->getOperand(2); 14738 EVT DataVT = Data.getValueType(); 14739 EVT HwSrcVt = getSVEContainerType(DataVT); 14740 SDValue InputVT = DAG.getValueType(DataVT); 14741 14742 if (DataVT == MVT::nxv8bf16 && 14743 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 14744 return SDValue(); 14745 14746 if (DataVT.isFloatingPoint()) 14747 InputVT = DAG.getValueType(HwSrcVt); 14748 14749 SDValue SrcNew; 14750 if (Data.getValueType().isFloatingPoint()) 14751 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); 14752 else 14753 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); 14754 14755 SDValue Ops[] = { N->getOperand(0), // Chain 14756 SrcNew, 14757 N->getOperand(4), // Base 14758 N->getOperand(3), // Pg 14759 InputVT 14760 }; 14761 14762 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); 14763 } 14764 14765 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { 14766 SDLoc DL(N); 14767 14768 SDValue Data = N->getOperand(2); 14769 EVT DataVT = Data.getValueType(); 14770 EVT PtrTy = N->getOperand(4).getValueType(); 14771 14772 if (DataVT == MVT::nxv8bf16 && 14773 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 14774 return SDValue(); 14775 14776 if (DataVT.isFloatingPoint()) 14777 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); 14778 14779 auto *MINode = cast<MemIntrinsicSDNode>(N); 14780 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4), 14781 DAG.getUNDEF(PtrTy), MINode->getOperand(3), 14782 MINode->getMemoryVT(), MINode->getMemOperand(), 14783 ISD::UNINDEXED, false, false); 14784 } 14785 14786 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The 14787 /// load store optimizer pass will merge them to store pair stores. 
This should 14788 /// be better than a movi to create the vector zero followed by a vector store 14789 /// if the zero constant is not re-used, since one instructions and one register 14790 /// live range will be removed. 14791 /// 14792 /// For example, the final generated code should be: 14793 /// 14794 /// stp xzr, xzr, [x0] 14795 /// 14796 /// instead of: 14797 /// 14798 /// movi v0.2d, #0 14799 /// str q0, [x0] 14800 /// 14801 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 14802 SDValue StVal = St.getValue(); 14803 EVT VT = StVal.getValueType(); 14804 14805 // Avoid scalarizing zero splat stores for scalable vectors. 14806 if (VT.isScalableVector()) 14807 return SDValue(); 14808 14809 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or 14810 // 2, 3 or 4 i32 elements. 14811 int NumVecElts = VT.getVectorNumElements(); 14812 if (!(((NumVecElts == 2 || NumVecElts == 3) && 14813 VT.getVectorElementType().getSizeInBits() == 64) || 14814 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && 14815 VT.getVectorElementType().getSizeInBits() == 32))) 14816 return SDValue(); 14817 14818 if (StVal.getOpcode() != ISD::BUILD_VECTOR) 14819 return SDValue(); 14820 14821 // If the zero constant has more than one use then the vector store could be 14822 // better since the constant mov will be amortized and stp q instructions 14823 // should be able to be formed. 14824 if (!StVal.hasOneUse()) 14825 return SDValue(); 14826 14827 // If the store is truncating then it's going down to i16 or smaller, which 14828 // means it can be implemented in a single store anyway. 14829 if (St.isTruncatingStore()) 14830 return SDValue(); 14831 14832 // If the immediate offset of the address operand is too large for the stp 14833 // instruction, then bail out. 14834 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { 14835 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); 14836 if (Offset < -512 || Offset > 504) 14837 return SDValue(); 14838 } 14839 14840 for (int I = 0; I < NumVecElts; ++I) { 14841 SDValue EltVal = StVal.getOperand(I); 14842 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) 14843 return SDValue(); 14844 } 14845 14846 // Use a CopyFromReg WZR/XZR here to prevent 14847 // DAGCombiner::MergeConsecutiveStores from undoing this transformation. 14848 SDLoc DL(&St); 14849 unsigned ZeroReg; 14850 EVT ZeroVT; 14851 if (VT.getVectorElementType().getSizeInBits() == 32) { 14852 ZeroReg = AArch64::WZR; 14853 ZeroVT = MVT::i32; 14854 } else { 14855 ZeroReg = AArch64::XZR; 14856 ZeroVT = MVT::i64; 14857 } 14858 SDValue SplatVal = 14859 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); 14860 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 14861 } 14862 14863 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 14864 /// value. The load store optimizer pass will merge them to store pair stores. 14865 /// This has better performance than a splat of the scalar followed by a split 14866 /// vector store. Even if the stores are not merged it is four stores vs a dup, 14867 /// followed by an ext.b and two stores. 14868 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 14869 SDValue StVal = St.getValue(); 14870 EVT VT = StVal.getValueType(); 14871 14872 // Don't replace floating point stores, they possibly won't be transformed to 14873 // stp because of the store pair suppress pass. 
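  // Illustrative example (a sketch of the intended effect, not literal
  // output): storing a v2i64 splat of x1 to [x0] as
  //
  //   dup v0.2d, x1            str x1, [x0]
  //   str q0, [x0]      ==>    str x1, [x0, #8]
  //
  // where the two scalar stores are expected to be merged into
  // "stp x1, x1, [x0]" by the load/store optimizer. The floating-point
  // bail-out below exists because that merge is less reliable for FP stores.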
14874 if (VT.isFloatingPoint()) 14875 return SDValue(); 14876 14877 // We can express a splat as store pair(s) for 2 or 4 elements. 14878 unsigned NumVecElts = VT.getVectorNumElements(); 14879 if (NumVecElts != 4 && NumVecElts != 2) 14880 return SDValue(); 14881 14882 // If the store is truncating then it's going down to i16 or smaller, which 14883 // means it can be implemented in a single store anyway. 14884 if (St.isTruncatingStore()) 14885 return SDValue(); 14886 14887 // Check that this is a splat. 14888 // Make sure that each of the relevant vector element locations are inserted 14889 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. 14890 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); 14891 SDValue SplatVal; 14892 for (unsigned I = 0; I < NumVecElts; ++I) { 14893 // Check for insert vector elements. 14894 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 14895 return SDValue(); 14896 14897 // Check that same value is inserted at each vector element. 14898 if (I == 0) 14899 SplatVal = StVal.getOperand(1); 14900 else if (StVal.getOperand(1) != SplatVal) 14901 return SDValue(); 14902 14903 // Check insert element index. 14904 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2)); 14905 if (!CIndex) 14906 return SDValue(); 14907 uint64_t IndexVal = CIndex->getZExtValue(); 14908 if (IndexVal >= NumVecElts) 14909 return SDValue(); 14910 IndexNotInserted.reset(IndexVal); 14911 14912 StVal = StVal.getOperand(0); 14913 } 14914 // Check that all vector element locations were inserted to. 14915 if (IndexNotInserted.any()) 14916 return SDValue(); 14917 14918 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 14919 } 14920 14921 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 14922 SelectionDAG &DAG, 14923 const AArch64Subtarget *Subtarget) { 14924 14925 StoreSDNode *S = cast<StoreSDNode>(N); 14926 if (S->isVolatile() || S->isIndexed()) 14927 return SDValue(); 14928 14929 SDValue StVal = S->getValue(); 14930 EVT VT = StVal.getValueType(); 14931 14932 if (!VT.isFixedLengthVector()) 14933 return SDValue(); 14934 14935 // If we get a splat of zeros, convert this vector store to a store of 14936 // scalars. They will be merged into store pairs of xzr thereby removing one 14937 // instruction and one register. 14938 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) 14939 return ReplacedZeroSplat; 14940 14941 // FIXME: The logic for deciding if an unaligned store should be split should 14942 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 14943 // a call to that function here. 14944 14945 if (!Subtarget->isMisaligned128StoreSlow()) 14946 return SDValue(); 14947 14948 // Don't split at -Oz. 14949 if (DAG.getMachineFunction().getFunction().hasMinSize()) 14950 return SDValue(); 14951 14952 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 14953 // those up regresses performance on micro-benchmarks and olden/bh. 14954 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 14955 return SDValue(); 14956 14957 // Split unaligned 16B stores. They are terrible for performance. 14958 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 14959 // extensions can use this to mark that it does not want splitting to happen 14960 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 14961 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 
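  // Illustrative sketch of the split performed below: an unaligned 128-bit
  // store such as "str q0, [x0]" (alignment 4 or 8) becomes two 64-bit stores
  // of the low and high halves via EXTRACT_SUBVECTOR, roughly
  //
  //   str d0, [x0]
  //   st1 { v0.d }[1], [x0, #8]
  //
  // Only 128-bit stores whose alignment lies strictly between 2 and 16 are
  // split: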
14962 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 14963 S->getAlignment() <= 2) 14964 return SDValue(); 14965 14966 // If we get a splat of a scalar convert this vector store to a store of 14967 // scalars. They will be merged into store pairs thereby removing two 14968 // instructions. 14969 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) 14970 return ReplacedSplat; 14971 14972 SDLoc DL(S); 14973 14974 // Split VT into two. 14975 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 14976 unsigned NumElts = HalfVT.getVectorNumElements(); 14977 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 14978 DAG.getConstant(0, DL, MVT::i64)); 14979 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 14980 DAG.getConstant(NumElts, DL, MVT::i64)); 14981 SDValue BasePtr = S->getBasePtr(); 14982 SDValue NewST1 = 14983 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 14984 S->getAlignment(), S->getMemOperand()->getFlags()); 14985 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 14986 DAG.getConstant(8, DL, MVT::i64)); 14987 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 14988 S->getPointerInfo(), S->getAlignment(), 14989 S->getMemOperand()->getFlags()); 14990 } 14991 14992 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) { 14993 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!"); 14994 14995 // splice(pg, op1, undef) -> op1 14996 if (N->getOperand(2).isUndef()) 14997 return N->getOperand(1); 14998 14999 return SDValue(); 15000 } 15001 15002 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { 15003 SDLoc DL(N); 15004 SDValue Op0 = N->getOperand(0); 15005 SDValue Op1 = N->getOperand(1); 15006 EVT ResVT = N->getValueType(0); 15007 15008 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) 15009 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { 15010 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { 15011 SDValue X = Op0.getOperand(0).getOperand(0); 15012 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); 15013 } 15014 } 15015 15016 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) 15017 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { 15018 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { 15019 SDValue Z = Op1.getOperand(0).getOperand(1); 15020 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); 15021 } 15022 } 15023 15024 return SDValue(); 15025 } 15026 15027 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) { 15028 unsigned Opc = N->getOpcode(); 15029 15030 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads 15031 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) || 15032 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads 15033 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) && 15034 "Invalid opcode."); 15035 15036 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO || 15037 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 15038 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO || 15039 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 15040 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO || 15041 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO || 15042 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO || 15043 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO; 15044 15045 SDLoc DL(N); 15046 SDValue Chain = N->getOperand(0); 15047 SDValue Pg = N->getOperand(1); 15048 SDValue Base = N->getOperand(2); 15049 SDValue Offset = N->getOperand(3); 15050 SDValue Ty = 
N->getOperand(4); 15051 15052 EVT ResVT = N->getValueType(0); 15053 15054 const auto OffsetOpc = Offset.getOpcode(); 15055 const bool OffsetIsZExt = 15056 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU; 15057 const bool OffsetIsSExt = 15058 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU; 15059 15060 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible. 15061 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) { 15062 SDValue ExtPg = Offset.getOperand(0); 15063 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode()); 15064 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType(); 15065 15066 // If the predicate for the sign- or zero-extended offset is the 15067 // same as the predicate used for this load and the sign-/zero-extension 15068 // was from a 32-bits... 15069 if (ExtPg == Pg && ExtFromEVT == MVT::i32) { 15070 SDValue UnextendedOffset = Offset.getOperand(1); 15071 15072 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true); 15073 if (Signed) 15074 NewOpc = getSignExtendedGatherOpcode(NewOpc); 15075 15076 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other}, 15077 {Chain, Pg, Base, UnextendedOffset, Ty}); 15078 } 15079 } 15080 15081 return SDValue(); 15082 } 15083 15084 /// Optimize a vector shift instruction and its operand if shifted out 15085 /// bits are not used. 15086 static SDValue performVectorShiftCombine(SDNode *N, 15087 const AArch64TargetLowering &TLI, 15088 TargetLowering::DAGCombinerInfo &DCI) { 15089 assert(N->getOpcode() == AArch64ISD::VASHR || 15090 N->getOpcode() == AArch64ISD::VLSHR); 15091 15092 SDValue Op = N->getOperand(0); 15093 unsigned OpScalarSize = Op.getScalarValueSizeInBits(); 15094 15095 unsigned ShiftImm = N->getConstantOperandVal(1); 15096 assert(OpScalarSize > ShiftImm && "Invalid shift imm"); 15097 15098 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); 15099 APInt DemandedMask = ~ShiftedOutBits; 15100 15101 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 15102 return SDValue(N, 0); 15103 15104 return SDValue(); 15105 } 15106 15107 /// Target-specific DAG combine function for post-increment LD1 (lane) and 15108 /// post-increment LD1R. 15109 static SDValue performPostLD1Combine(SDNode *N, 15110 TargetLowering::DAGCombinerInfo &DCI, 15111 bool IsLaneOp) { 15112 if (DCI.isBeforeLegalizeOps()) 15113 return SDValue(); 15114 15115 SelectionDAG &DAG = DCI.DAG; 15116 EVT VT = N->getValueType(0); 15117 15118 if (VT.isScalableVector()) 15119 return SDValue(); 15120 15121 unsigned LoadIdx = IsLaneOp ? 1 : 0; 15122 SDNode *LD = N->getOperand(LoadIdx).getNode(); 15123 // If it is not LOAD, can not do such combine. 15124 if (LD->getOpcode() != ISD::LOAD) 15125 return SDValue(); 15126 15127 // The vector lane must be a constant in the LD1LANE opcode. 15128 SDValue Lane; 15129 if (IsLaneOp) { 15130 Lane = N->getOperand(2); 15131 auto *LaneC = dyn_cast<ConstantSDNode>(Lane); 15132 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) 15133 return SDValue(); 15134 } 15135 15136 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 15137 EVT MemVT = LoadSDN->getMemoryVT(); 15138 // Check if memory operand is the same type as the vector element. 15139 if (MemVT != VT.getVectorElementType()) 15140 return SDValue(); 15141 15142 // Check if there are other uses. If so, do not combine as it will introduce 15143 // an extra load. 
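  // For reference, the overall rewrite being attempted is, illustratively:
  //
  //   x = load i32, ptr           ld1r { v0.4s }, [x0], #4   (!IsLaneOp)
  //   v = dup/insertelt x   ==>   ld1  { v0.s }[1], [x0], #4 (IsLaneOp)
  //   ptr = add ptr, 4
  //
  // where the post-indexed form also produces the updated pointer. The loop
  // below enforces that the load has no other (non-chain) users.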
15144 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 15145 ++UI) { 15146 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 15147 continue; 15148 if (*UI != N) 15149 return SDValue(); 15150 } 15151 15152 SDValue Addr = LD->getOperand(1); 15153 SDValue Vector = N->getOperand(0); 15154 // Search for a use of the address operand that is an increment. 15155 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 15156 Addr.getNode()->use_end(); UI != UE; ++UI) { 15157 SDNode *User = *UI; 15158 if (User->getOpcode() != ISD::ADD 15159 || UI.getUse().getResNo() != Addr.getResNo()) 15160 continue; 15161 15162 // If the increment is a constant, it must match the memory ref size. 15163 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 15164 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 15165 uint32_t IncVal = CInc->getZExtValue(); 15166 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 15167 if (IncVal != NumBytes) 15168 continue; 15169 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 15170 } 15171 15172 // To avoid cycle construction make sure that neither the load nor the add 15173 // are predecessors to each other or the Vector. 15174 SmallPtrSet<const SDNode *, 32> Visited; 15175 SmallVector<const SDNode *, 16> Worklist; 15176 Visited.insert(Addr.getNode()); 15177 Worklist.push_back(User); 15178 Worklist.push_back(LD); 15179 Worklist.push_back(Vector.getNode()); 15180 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || 15181 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15182 continue; 15183 15184 SmallVector<SDValue, 8> Ops; 15185 Ops.push_back(LD->getOperand(0)); // Chain 15186 if (IsLaneOp) { 15187 Ops.push_back(Vector); // The vector to be inserted 15188 Ops.push_back(Lane); // The lane to be inserted in the vector 15189 } 15190 Ops.push_back(Addr); 15191 Ops.push_back(Inc); 15192 15193 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 15194 SDVTList SDTys = DAG.getVTList(Tys); 15195 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 15196 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 15197 MemVT, 15198 LoadSDN->getMemOperand()); 15199 15200 // Update the uses. 15201 SDValue NewResults[] = { 15202 SDValue(LD, 0), // The result of load 15203 SDValue(UpdN.getNode(), 2) // Chain 15204 }; 15205 DCI.CombineTo(LD, NewResults); 15206 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 15207 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 15208 15209 break; 15210 } 15211 return SDValue(); 15212 } 15213 15214 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during 15215 /// address translation. 
15216 static bool performTBISimplification(SDValue Addr, 15217 TargetLowering::DAGCombinerInfo &DCI, 15218 SelectionDAG &DAG) { 15219 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 15220 KnownBits Known; 15221 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 15222 !DCI.isBeforeLegalizeOps()); 15223 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15224 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { 15225 DCI.CommitTargetLoweringOpt(TLO); 15226 return true; 15227 } 15228 return false; 15229 } 15230 15231 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { 15232 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) && 15233 "Expected STORE dag node in input!"); 15234 15235 if (auto Store = dyn_cast<StoreSDNode>(N)) { 15236 if (!Store->isTruncatingStore() || Store->isIndexed()) 15237 return SDValue(); 15238 SDValue Ext = Store->getValue(); 15239 auto ExtOpCode = Ext.getOpcode(); 15240 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND && 15241 ExtOpCode != ISD::ANY_EXTEND) 15242 return SDValue(); 15243 SDValue Orig = Ext->getOperand(0); 15244 if (Store->getMemoryVT() != Orig->getValueType(0)) 15245 return SDValue(); 15246 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig, 15247 Store->getBasePtr(), Store->getPointerInfo(), 15248 Store->getAlign()); 15249 } 15250 15251 return SDValue(); 15252 } 15253 15254 static SDValue performSTORECombine(SDNode *N, 15255 TargetLowering::DAGCombinerInfo &DCI, 15256 SelectionDAG &DAG, 15257 const AArch64Subtarget *Subtarget) { 15258 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) 15259 return Split; 15260 15261 if (Subtarget->supportsAddressTopByteIgnored() && 15262 performTBISimplification(N->getOperand(2), DCI, DAG)) 15263 return SDValue(N, 0); 15264 15265 if (SDValue Store = foldTruncStoreOfExt(DAG, N)) 15266 return Store; 15267 15268 return SDValue(); 15269 } 15270 15271 /// Target-specific DAG combine function for NEON load/store intrinsics 15272 /// to merge base address updates. 15273 static SDValue performNEONPostLDSTCombine(SDNode *N, 15274 TargetLowering::DAGCombinerInfo &DCI, 15275 SelectionDAG &DAG) { 15276 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 15277 return SDValue(); 15278 15279 unsigned AddrOpIdx = N->getNumOperands() - 1; 15280 SDValue Addr = N->getOperand(AddrOpIdx); 15281 15282 // Search for a use of the address operand that is an increment. 15283 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 15284 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 15285 SDNode *User = *UI; 15286 if (User->getOpcode() != ISD::ADD || 15287 UI.getUse().getResNo() != Addr.getResNo()) 15288 continue; 15289 15290 // Check that the add is independent of the load/store. Otherwise, folding 15291 // it would create a cycle. 15292 SmallPtrSet<const SDNode *, 32> Visited; 15293 SmallVector<const SDNode *, 16> Worklist; 15294 Visited.insert(Addr.getNode()); 15295 Worklist.push_back(N); 15296 Worklist.push_back(User); 15297 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 15298 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15299 continue; 15300 15301 // Find the new opcode for the updating load/store. 
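    // For example (illustrative), with N being an aarch64.neon.ld2 intrinsic:
    //
    //   ld2 { v0.4s, v1.4s }, [x0]
    //   add x0, x0, #32
    // ==>
    //   ld2 { v0.4s, v1.4s }, [x0], #32
    //
    // The switch below maps each intrinsic to its post-indexed ISD node.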
15302 bool IsStore = false; 15303 bool IsLaneOp = false; 15304 bool IsDupOp = false; 15305 unsigned NewOpc = 0; 15306 unsigned NumVecs = 0; 15307 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 15308 switch (IntNo) { 15309 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 15310 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 15311 NumVecs = 2; break; 15312 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 15313 NumVecs = 3; break; 15314 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 15315 NumVecs = 4; break; 15316 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 15317 NumVecs = 2; IsStore = true; break; 15318 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 15319 NumVecs = 3; IsStore = true; break; 15320 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 15321 NumVecs = 4; IsStore = true; break; 15322 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 15323 NumVecs = 2; break; 15324 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 15325 NumVecs = 3; break; 15326 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 15327 NumVecs = 4; break; 15328 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 15329 NumVecs = 2; IsStore = true; break; 15330 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 15331 NumVecs = 3; IsStore = true; break; 15332 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 15333 NumVecs = 4; IsStore = true; break; 15334 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 15335 NumVecs = 2; IsDupOp = true; break; 15336 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 15337 NumVecs = 3; IsDupOp = true; break; 15338 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 15339 NumVecs = 4; IsDupOp = true; break; 15340 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 15341 NumVecs = 2; IsLaneOp = true; break; 15342 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 15343 NumVecs = 3; IsLaneOp = true; break; 15344 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 15345 NumVecs = 4; IsLaneOp = true; break; 15346 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 15347 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 15348 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 15349 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 15350 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 15351 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 15352 } 15353 15354 EVT VecTy; 15355 if (IsStore) 15356 VecTy = N->getOperand(2).getValueType(); 15357 else 15358 VecTy = N->getValueType(0); 15359 15360 // If the increment is a constant, it must match the memory ref size. 15361 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 15362 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 15363 uint32_t IncVal = CInc->getZExtValue(); 15364 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 15365 if (IsLaneOp || IsDupOp) 15366 NumBytes /= VecTy.getVectorNumElements(); 15367 if (IncVal != NumBytes) 15368 continue; 15369 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 15370 } 15371 SmallVector<SDValue, 8> Ops; 15372 Ops.push_back(N->getOperand(0)); // Incoming chain 15373 // Load lane and store have vector list as input. 
15374 if (IsLaneOp || IsStore) 15375 for (unsigned i = 2; i < AddrOpIdx; ++i) 15376 Ops.push_back(N->getOperand(i)); 15377 Ops.push_back(Addr); // Base register 15378 Ops.push_back(Inc); 15379 15380 // Return Types. 15381 EVT Tys[6]; 15382 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 15383 unsigned n; 15384 for (n = 0; n < NumResultVecs; ++n) 15385 Tys[n] = VecTy; 15386 Tys[n++] = MVT::i64; // Type of write back register 15387 Tys[n] = MVT::Other; // Type of the chain 15388 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 15389 15390 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 15391 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 15392 MemInt->getMemoryVT(), 15393 MemInt->getMemOperand()); 15394 15395 // Update the uses. 15396 std::vector<SDValue> NewResults; 15397 for (unsigned i = 0; i < NumResultVecs; ++i) { 15398 NewResults.push_back(SDValue(UpdN.getNode(), i)); 15399 } 15400 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 15401 DCI.CombineTo(N, NewResults); 15402 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 15403 15404 break; 15405 } 15406 return SDValue(); 15407 } 15408 15409 // Checks to see if the value is the prescribed width and returns information 15410 // about its extension mode. 15411 static 15412 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 15413 ExtType = ISD::NON_EXTLOAD; 15414 switch(V.getNode()->getOpcode()) { 15415 default: 15416 return false; 15417 case ISD::LOAD: { 15418 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 15419 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 15420 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 15421 ExtType = LoadNode->getExtensionType(); 15422 return true; 15423 } 15424 return false; 15425 } 15426 case ISD::AssertSext: { 15427 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 15428 if ((TypeNode->getVT() == MVT::i8 && width == 8) 15429 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 15430 ExtType = ISD::SEXTLOAD; 15431 return true; 15432 } 15433 return false; 15434 } 15435 case ISD::AssertZext: { 15436 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 15437 if ((TypeNode->getVT() == MVT::i8 && width == 8) 15438 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 15439 ExtType = ISD::ZEXTLOAD; 15440 return true; 15441 } 15442 return false; 15443 } 15444 case ISD::Constant: 15445 case ISD::TargetConstant: { 15446 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 15447 1LL << (width - 1); 15448 } 15449 } 15450 15451 return true; 15452 } 15453 15454 // This function does a whole lot of voodoo to determine if the tests are 15455 // equivalent without and with a mask. Essentially what happens is that given a 15456 // DAG resembling: 15457 // 15458 // +-------------+ +-------------+ +-------------+ +-------------+ 15459 // | Input | | AddConstant | | CompConstant| | CC | 15460 // +-------------+ +-------------+ +-------------+ +-------------+ 15461 // | | | | 15462 // V V | +----------+ 15463 // +-------------+ +----+ | | 15464 // | ADD | |0xff| | | 15465 // +-------------+ +----+ | | 15466 // | | | | 15467 // V V | | 15468 // +-------------+ | | 15469 // | AND | | | 15470 // +-------------+ | | 15471 // | | | 15472 // +-----+ | | 15473 // | | | 15474 // V V V 15475 // +-------------+ 15476 // | CMP | 15477 // +-------------+ 15478 // 15479 // The AND node may be safely removed for some combinations of inputs. 
In 15480 // particular we need to take into account the extension type of the Input,
15481 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
15482 // width of the input (this can work for inputs of any width; the above graph is
15483 // specific to 8 bits).
15484 //
15485 // The specific equations were worked out by generating output tables for each
15486 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
15487 // problem was simplified by working with 4 bit inputs, which means we only
15488 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
15489 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
15490 // patterns present in both extensions (0,7). For every distinct set of
15491 // AddConstant and CompConstant bit patterns we can consider the masked and
15492 // unmasked versions to be equivalent if the result of this function is true for
15493 // all 16 distinct bit patterns for the current extension type of Input (w0).
15494 //
15495 // sub w8, w0, w1
15496 // and w10, w8, #0x0f
15497 // cmp w8, w2
15498 // cset w9, AArch64CC
15499 // cmp w10, w2
15500 // cset w11, AArch64CC
15501 // cmp w9, w11
15502 // cset w0, eq
15503 // ret
15504 //
15505 // Since the above sequence shows when the outputs are equivalent, it defines
15506 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
15507 // would be expensive to run during compiles. The equations below were written
15508 // in a test harness that confirmed they give outputs equivalent to the above
15509 // sequence for all inputs, so they can be used to determine whether the removal
15510 // is legal instead.
15511 //
15512 // isEquivalentMaskless() is the test for whether the AND can be removed,
15513 // factored out of the DAG recognition since the DAG can take several forms.
15514
15515 static bool isEquivalentMaskless(unsigned CC, unsigned width,
15516                                  ISD::LoadExtType ExtType, int AddConstant,
15517                                  int CompConstant) {
15518   // By being careful about our equations and only writing them in terms of
15519   // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
15520   // make them generally applicable to all bit widths.
15521   int MaxUInt = (1 << width);
15522
15523   // For the purposes of these comparisons sign extending the type is
15524   // equivalent to zero extending the add and displacing it by half the integer
15525   // width. Provided we are careful and make sure our equations are valid over
15526   // the whole range we can just adjust the input and avoid writing equations
15527   // for sign extended inputs.
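  // Worked example (illustrative): for width == 8, MaxUInt == 256. A
  // sign-extended input with AddConstant == 16 is treated below as the
  // zero-extended case with AddConstant == 16 - 128 == -112, so the per-CC
  // equations only ever need to cover the zero-extended range.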
15528 if (ExtType == ISD::SEXTLOAD) 15529 AddConstant -= (1 << (width-1)); 15530 15531 switch(CC) { 15532 case AArch64CC::LE: 15533 case AArch64CC::GT: 15534 if ((AddConstant == 0) || 15535 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 15536 (AddConstant >= 0 && CompConstant < 0) || 15537 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 15538 return true; 15539 break; 15540 case AArch64CC::LT: 15541 case AArch64CC::GE: 15542 if ((AddConstant == 0) || 15543 (AddConstant >= 0 && CompConstant <= 0) || 15544 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 15545 return true; 15546 break; 15547 case AArch64CC::HI: 15548 case AArch64CC::LS: 15549 if ((AddConstant >= 0 && CompConstant < 0) || 15550 (AddConstant <= 0 && CompConstant >= -1 && 15551 CompConstant < AddConstant + MaxUInt)) 15552 return true; 15553 break; 15554 case AArch64CC::PL: 15555 case AArch64CC::MI: 15556 if ((AddConstant == 0) || 15557 (AddConstant > 0 && CompConstant <= 0) || 15558 (AddConstant < 0 && CompConstant <= AddConstant)) 15559 return true; 15560 break; 15561 case AArch64CC::LO: 15562 case AArch64CC::HS: 15563 if ((AddConstant >= 0 && CompConstant <= 0) || 15564 (AddConstant <= 0 && CompConstant >= 0 && 15565 CompConstant <= AddConstant + MaxUInt)) 15566 return true; 15567 break; 15568 case AArch64CC::EQ: 15569 case AArch64CC::NE: 15570 if ((AddConstant > 0 && CompConstant < 0) || 15571 (AddConstant < 0 && CompConstant >= 0 && 15572 CompConstant < AddConstant + MaxUInt) || 15573 (AddConstant >= 0 && CompConstant >= 0 && 15574 CompConstant >= AddConstant) || 15575 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 15576 return true; 15577 break; 15578 case AArch64CC::VS: 15579 case AArch64CC::VC: 15580 case AArch64CC::AL: 15581 case AArch64CC::NV: 15582 return true; 15583 case AArch64CC::Invalid: 15584 break; 15585 } 15586 15587 return false; 15588 } 15589 15590 static 15591 SDValue performCONDCombine(SDNode *N, 15592 TargetLowering::DAGCombinerInfo &DCI, 15593 SelectionDAG &DAG, unsigned CCIndex, 15594 unsigned CmpIndex) { 15595 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 15596 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 15597 unsigned CondOpcode = SubsNode->getOpcode(); 15598 15599 if (CondOpcode != AArch64ISD::SUBS) 15600 return SDValue(); 15601 15602 // There is a SUBS feeding this condition. Is it fed by a mask we can 15603 // use? 15604 15605 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 15606 unsigned MaskBits = 0; 15607 15608 if (AndNode->getOpcode() != ISD::AND) 15609 return SDValue(); 15610 15611 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 15612 uint32_t CNV = CN->getZExtValue(); 15613 if (CNV == 255) 15614 MaskBits = 8; 15615 else if (CNV == 65535) 15616 MaskBits = 16; 15617 } 15618 15619 if (!MaskBits) 15620 return SDValue(); 15621 15622 SDValue AddValue = AndNode->getOperand(0); 15623 15624 if (AddValue.getOpcode() != ISD::ADD) 15625 return SDValue(); 15626 15627 // The basic dag structure is correct, grab the inputs and validate them. 15628 15629 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 15630 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 15631 SDValue SubsInputValue = SubsNode->getOperand(1); 15632 15633 // The mask is present and the provenance of all the values is a smaller type, 15634 // lets see if the mask is superfluous. 
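  // Illustrative sketch of the removal this enables:
  //
  //   add  w8, w0, #1               add  w8, w0, #1
  //   and  w9, w8, #0xff     ==>    subs wzr, w8, #4
  //   subs wzr, w9, #4              b.<cc> ...
  //   b.<cc> ...
  //
  // which is only valid when w0 is known to come from an 8-bit (or 16-bit)
  // value and isEquivalentMaskless() agrees for the given CC and constants.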
15635 15636 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 15637 !isa<ConstantSDNode>(SubsInputValue.getNode())) 15638 return SDValue(); 15639 15640 ISD::LoadExtType ExtType; 15641 15642 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 15643 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 15644 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 15645 return SDValue(); 15646 15647 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 15648 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 15649 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 15650 return SDValue(); 15651 15652 // The AND is not necessary, remove it. 15653 15654 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 15655 SubsNode->getValueType(1)); 15656 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 15657 15658 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 15659 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 15660 15661 return SDValue(N, 0); 15662 } 15663 15664 // Optimize compare with zero and branch. 15665 static SDValue performBRCONDCombine(SDNode *N, 15666 TargetLowering::DAGCombinerInfo &DCI, 15667 SelectionDAG &DAG) { 15668 MachineFunction &MF = DAG.getMachineFunction(); 15669 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 15670 // will not be produced, as they are conditional branch instructions that do 15671 // not set flags. 15672 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) 15673 return SDValue(); 15674 15675 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) 15676 N = NV.getNode(); 15677 SDValue Chain = N->getOperand(0); 15678 SDValue Dest = N->getOperand(1); 15679 SDValue CCVal = N->getOperand(2); 15680 SDValue Cmp = N->getOperand(3); 15681 15682 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 15683 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 15684 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 15685 return SDValue(); 15686 15687 unsigned CmpOpc = Cmp.getOpcode(); 15688 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 15689 return SDValue(); 15690 15691 // Only attempt folding if there is only one use of the flag and no use of the 15692 // value. 15693 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 15694 return SDValue(); 15695 15696 SDValue LHS = Cmp.getOperand(0); 15697 SDValue RHS = Cmp.getOperand(1); 15698 15699 assert(LHS.getValueType() == RHS.getValueType() && 15700 "Expected the value type to be the same for both operands!"); 15701 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 15702 return SDValue(); 15703 15704 if (isNullConstant(LHS)) 15705 std::swap(LHS, RHS); 15706 15707 if (!isNullConstant(RHS)) 15708 return SDValue(); 15709 15710 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 15711 LHS.getOpcode() == ISD::SRL) 15712 return SDValue(); 15713 15714 // Fold the compare into the branch instruction. 15715 SDValue BR; 15716 if (CC == AArch64CC::EQ) 15717 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 15718 else 15719 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 15720 15721 // Do not add new nodes to DAG combiner worklist. 
15722 DCI.CombineTo(N, BR, false); 15723 15724 return SDValue(); 15725 } 15726 15727 // Optimize CSEL instructions 15728 static SDValue performCSELCombine(SDNode *N, 15729 TargetLowering::DAGCombinerInfo &DCI, 15730 SelectionDAG &DAG) { 15731 // CSEL x, x, cc -> x 15732 if (N->getOperand(0) == N->getOperand(1)) 15733 return N->getOperand(0); 15734 15735 return performCONDCombine(N, DCI, DAG, 2, 3); 15736 } 15737 15738 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { 15739 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); 15740 SDValue LHS = N->getOperand(0); 15741 SDValue RHS = N->getOperand(1); 15742 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); 15743 15744 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X 15745 if (Cond == ISD::SETNE && isOneConstant(RHS) && 15746 LHS->getOpcode() == AArch64ISD::CSEL && 15747 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) && 15748 LHS->hasOneUse()) { 15749 SDLoc DL(N); 15750 15751 // Invert CSEL's condition. 15752 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2)); 15753 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue()); 15754 auto NewCond = getInvertedCondCode(OldCond); 15755 15756 // csel 0, 1, !cond, X 15757 SDValue CSEL = 15758 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), 15759 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), 15760 LHS.getOperand(3)); 15761 return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0)); 15762 } 15763 15764 return SDValue(); 15765 } 15766 15767 static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { 15768 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 15769 "Unexpected opcode!"); 15770 15771 SDValue Pred = N->getOperand(0); 15772 SDValue LHS = N->getOperand(1); 15773 SDValue RHS = N->getOperand(2); 15774 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); 15775 15776 // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne 15777 // => inner setcc_merge_zero 15778 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && 15779 LHS->getOpcode() == ISD::SIGN_EXTEND && 15780 LHS->getOperand(0)->getValueType(0) == N->getValueType(0) && 15781 LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 15782 LHS->getOperand(0)->getOperand(0) == Pred) 15783 return LHS->getOperand(0); 15784 15785 return SDValue(); 15786 } 15787 15788 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 15789 // as well as whether the test should be inverted. This code is required to 15790 // catch these cases (as opposed to standard dag combines) because 15791 // AArch64ISD::TBZ is matched during legalization. 15792 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 15793 SelectionDAG &DAG) { 15794 15795 if (!Op->hasOneUse()) 15796 return Op; 15797 15798 // We don't handle undef/constant-fold cases below, as they should have 15799 // already been taken care of (e.g. and of 0, test of undefined shifted bits, 15800 // etc.) 15801 15802 // (tbz (trunc x), b) -> (tbz x, b) 15803 // This case is just here to enable more of the below cases to be caught. 15804 if (Op->getOpcode() == ISD::TRUNCATE && 15805 Bit < Op->getValueType(0).getSizeInBits()) { 15806 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15807 } 15808 15809 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 
15810 if (Op->getOpcode() == ISD::ANY_EXTEND && 15811 Bit < Op->getOperand(0).getValueSizeInBits()) { 15812 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15813 } 15814 15815 if (Op->getNumOperands() != 2) 15816 return Op; 15817 15818 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 15819 if (!C) 15820 return Op; 15821 15822 switch (Op->getOpcode()) { 15823 default: 15824 return Op; 15825 15826 // (tbz (and x, m), b) -> (tbz x, b) 15827 case ISD::AND: 15828 if ((C->getZExtValue() >> Bit) & 1) 15829 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15830 return Op; 15831 15832 // (tbz (shl x, c), b) -> (tbz x, b-c) 15833 case ISD::SHL: 15834 if (C->getZExtValue() <= Bit && 15835 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 15836 Bit = Bit - C->getZExtValue(); 15837 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15838 } 15839 return Op; 15840 15841 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 15842 case ISD::SRA: 15843 Bit = Bit + C->getZExtValue(); 15844 if (Bit >= Op->getValueType(0).getSizeInBits()) 15845 Bit = Op->getValueType(0).getSizeInBits() - 1; 15846 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15847 15848 // (tbz (srl x, c), b) -> (tbz x, b+c) 15849 case ISD::SRL: 15850 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 15851 Bit = Bit + C->getZExtValue(); 15852 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15853 } 15854 return Op; 15855 15856 // (tbz (xor x, -1), b) -> (tbnz x, b) 15857 case ISD::XOR: 15858 if ((C->getZExtValue() >> Bit) & 1) 15859 Invert = !Invert; 15860 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 15861 } 15862 } 15863 15864 // Optimize test single bit zero/non-zero and branch. 15865 static SDValue performTBZCombine(SDNode *N, 15866 TargetLowering::DAGCombinerInfo &DCI, 15867 SelectionDAG &DAG) { 15868 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 15869 bool Invert = false; 15870 SDValue TestSrc = N->getOperand(1); 15871 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 15872 15873 if (TestSrc == NewTestSrc) 15874 return SDValue(); 15875 15876 unsigned NewOpc = N->getOpcode(); 15877 if (Invert) { 15878 if (NewOpc == AArch64ISD::TBZ) 15879 NewOpc = AArch64ISD::TBNZ; 15880 else { 15881 assert(NewOpc == AArch64ISD::TBNZ); 15882 NewOpc = AArch64ISD::TBZ; 15883 } 15884 } 15885 15886 SDLoc DL(N); 15887 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 15888 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 15889 } 15890 15891 // vselect (v1i1 setcc) -> 15892 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 15893 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 15894 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 15895 // such VSELECT. 15896 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 15897 SDValue N0 = N->getOperand(0); 15898 EVT CCVT = N0.getValueType(); 15899 15900 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform 15901 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the 15902 // supported types. 
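  // For example (illustrative), for v4i32:
  //
  //   vselect (setgt x, splat(-1)), splat(1), splat(-1)
  //     ==>  or (sra x, splat(31)), splat(1)
  //
  // since (sra x, 31) is 0 in non-negative lanes and -1 in negative ones;
  // OR-ing with 1 then yields 1 or -1 per lane, as required.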
15903 SDValue SetCC = N->getOperand(0); 15904 if (SetCC.getOpcode() == ISD::SETCC && 15905 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) { 15906 SDValue CmpLHS = SetCC.getOperand(0); 15907 EVT VT = CmpLHS.getValueType(); 15908 SDNode *CmpRHS = SetCC.getOperand(1).getNode(); 15909 SDNode *SplatLHS = N->getOperand(1).getNode(); 15910 SDNode *SplatRHS = N->getOperand(2).getNode(); 15911 APInt SplatLHSVal; 15912 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() && 15913 VT.isSimple() && 15914 is_contained( 15915 makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 15916 MVT::v2i32, MVT::v4i32, MVT::v2i64}), 15917 VT.getSimpleVT().SimpleTy) && 15918 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) && 15919 SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) && 15920 ISD::isConstantSplatVectorAllOnes(SplatRHS)) { 15921 unsigned NumElts = VT.getVectorNumElements(); 15922 SmallVector<SDValue, 8> Ops( 15923 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N), 15924 VT.getScalarType())); 15925 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops); 15926 15927 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val); 15928 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1)); 15929 return Or; 15930 } 15931 } 15932 15933 if (N0.getOpcode() != ISD::SETCC || 15934 CCVT.getVectorElementCount() != ElementCount::getFixed(1) || 15935 CCVT.getVectorElementType() != MVT::i1) 15936 return SDValue(); 15937 15938 EVT ResVT = N->getValueType(0); 15939 EVT CmpVT = N0.getOperand(0).getValueType(); 15940 // Only combine when the result type is of the same size as the compared 15941 // operands. 15942 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 15943 return SDValue(); 15944 15945 SDValue IfTrue = N->getOperand(1); 15946 SDValue IfFalse = N->getOperand(2); 15947 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 15948 N0.getOperand(0), N0.getOperand(1), 15949 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 15950 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 15951 IfTrue, IfFalse); 15952 } 15953 15954 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 15955 /// the compare-mask instructions rather than going via NZCV, even if LHS and 15956 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 15957 /// with a vector one followed by a DUP shuffle on the result. 15958 static SDValue performSelectCombine(SDNode *N, 15959 TargetLowering::DAGCombinerInfo &DCI) { 15960 SelectionDAG &DAG = DCI.DAG; 15961 SDValue N0 = N->getOperand(0); 15962 EVT ResVT = N->getValueType(0); 15963 15964 if (N0.getOpcode() != ISD::SETCC) 15965 return SDValue(); 15966 15967 if (ResVT.isScalableVector()) 15968 return SDValue(); 15969 15970 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 15971 // scalar SetCCResultType. We also don't expect vectors, because we assume 15972 // that selects fed by vector SETCCs are canonicalized to VSELECT. 15973 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 15974 "Scalar-SETCC feeding SELECT has unexpected result type!"); 15975 15976 // If NumMaskElts == 0, the comparison is larger than select result. The 15977 // largest real NEON comparison is 64-bits per lane, which means the result is 15978 // at most 32-bits and an illegal vector. Just bail out for now. 
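  // For reference, the intended rewrite is, illustratively, for
  //   select (setcc f32 a, b), v4f32 t, v4f32 f
  // a vector setcc (v4i32 mask) on SCALAR_TO_VECTOR copies of a and b, whose
  // lane 0 is then broadcast with a DUP shuffle, bitcast to v4i32 and used as
  // the mask of a vselect of t and f.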
15979 EVT SrcVT = N0.getOperand(0).getValueType(); 15980 15981 // Don't try to do this optimization when the setcc itself has i1 operands. 15982 // There are no legal vectors of i1, so this would be pointless. 15983 if (SrcVT == MVT::i1) 15984 return SDValue(); 15985 15986 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 15987 if (!ResVT.isVector() || NumMaskElts == 0) 15988 return SDValue(); 15989 15990 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 15991 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 15992 15993 // Also bail out if the vector CCVT isn't the same size as ResVT. 15994 // This can happen if the SETCC operand size doesn't divide the ResVT size 15995 // (e.g., f64 vs v3f32). 15996 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 15997 return SDValue(); 15998 15999 // Make sure we didn't create illegal types, if we're not supposed to. 16000 assert(DCI.isBeforeLegalize() || 16001 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 16002 16003 // First perform a vector comparison, where lane 0 is the one we're interested 16004 // in. 16005 SDLoc DL(N0); 16006 SDValue LHS = 16007 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 16008 SDValue RHS = 16009 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 16010 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 16011 16012 // Now duplicate the comparison mask we want across all other lanes. 16013 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 16014 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 16015 Mask = DAG.getNode(ISD::BITCAST, DL, 16016 ResVT.changeVectorElementTypeToInteger(), Mask); 16017 16018 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 16019 } 16020 16021 /// Get rid of unnecessary NVCASTs (that don't change the type). 16022 static SDValue performNVCASTCombine(SDNode *N) { 16023 if (N->getValueType(0) == N->getOperand(0).getValueType()) 16024 return N->getOperand(0); 16025 16026 return SDValue(); 16027 } 16028 16029 // If all users of the globaladdr are of the form (globaladdr + constant), find 16030 // the smallest constant, fold it into the globaladdr's offset and rewrite the 16031 // globaladdr as (globaladdr + constant) - constant. 16032 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, 16033 const AArch64Subtarget *Subtarget, 16034 const TargetMachine &TM) { 16035 auto *GN = cast<GlobalAddressSDNode>(N); 16036 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != 16037 AArch64II::MO_NO_FLAG) 16038 return SDValue(); 16039 16040 uint64_t MinOffset = -1ull; 16041 for (SDNode *N : GN->uses()) { 16042 if (N->getOpcode() != ISD::ADD) 16043 return SDValue(); 16044 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0)); 16045 if (!C) 16046 C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 16047 if (!C) 16048 return SDValue(); 16049 MinOffset = std::min(MinOffset, C->getZExtValue()); 16050 } 16051 uint64_t Offset = MinOffset + GN->getOffset(); 16052 16053 // Require that the new offset is larger than the existing one. Otherwise, we 16054 // can end up oscillating between two possible DAGs, for example, 16055 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). 16056 if (Offset <= uint64_t(GN->getOffset())) 16057 return SDValue(); 16058 16059 // Check whether folding this offset is legal. 
It must not go out of bounds of
16060   // the referenced object to avoid violating the code model, and must be
16061   // smaller than 2^21 because this is the largest offset expressible in all
16062   // object formats.
16063   //
16064   // This check also prevents us from folding negative offsets, which will end
16065   // up being treated in the same way as large positive ones. They could also
16066   // cause code model violations, and aren't really common enough to matter.
16067   if (Offset >= (1 << 21))
16068     return SDValue();
16069
16070   const GlobalValue *GV = GN->getGlobal();
16071   Type *T = GV->getValueType();
16072   if (!T->isSized() ||
16073       Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
16074     return SDValue();
16075
16076   SDLoc DL(GN);
16077   SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
16078   return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
16079                      DAG.getConstant(MinOffset, DL, MVT::i64));
16080 }
16081
16082 // Turns the vector of indices into a vector of byte offsets by scaling Offset
16083 // by (BitWidth / 8).
16084 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
16085                                           SDLoc DL, unsigned BitWidth) {
16086   assert(Offset.getValueType().isScalableVector() &&
16087          "This method is only for scalable vectors of offsets");
16088
16089   SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
16090   SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
16091
16092   return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
16093 }
16094
16095 /// Check if the value of \p OffsetInBytes can be used as an immediate for
16096 /// the gather load/prefetch and scatter store instructions with vector base and
16097 /// immediate offset addressing mode:
16098 ///
16099 ///   [<Zn>.[S|D]{, #<imm>}]
16100 ///
16101 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
16102 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
16103                                                   unsigned ScalarSizeInBytes) {
16104   // The immediate is not a multiple of the scalar size.
16105   if (OffsetInBytes % ScalarSizeInBytes)
16106     return false;
16107
16108   // The immediate is out of range.
16109   if (OffsetInBytes / ScalarSizeInBytes > 31)
16110     return false;
16111
16112   return true;
16113 }
16114
16115 /// Check if the value of \p Offset represents a valid immediate for the SVE
16116 /// gather load/prefetch and scatter store instructions with vector base and
16117 /// immediate offset addressing mode:
16118 ///
16119 ///   [<Zn>.[S|D]{, #<imm>}]
16120 ///
16121 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
16122 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, 16123 unsigned ScalarSizeInBytes) { 16124 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode()); 16125 return OffsetConst && isValidImmForSVEVecImmAddrMode( 16126 OffsetConst->getZExtValue(), ScalarSizeInBytes); 16127 } 16128 16129 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, 16130 unsigned Opcode, 16131 bool OnlyPackedOffsets = true) { 16132 const SDValue Src = N->getOperand(2); 16133 const EVT SrcVT = Src->getValueType(0); 16134 assert(SrcVT.isScalableVector() && 16135 "Scatter stores are only possible for SVE vectors"); 16136 16137 SDLoc DL(N); 16138 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT(); 16139 16140 // Make sure that source data will fit into an SVE register 16141 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) 16142 return SDValue(); 16143 16144 // For FPs, ACLE only supports _packed_ single and double precision types. 16145 if (SrcElVT.isFloatingPoint()) 16146 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64)) 16147 return SDValue(); 16148 16149 // Depending on the addressing mode, this is either a pointer or a vector of 16150 // pointers (that fits into one register) 16151 SDValue Base = N->getOperand(4); 16152 // Depending on the addressing mode, this is either a single offset or a 16153 // vector of offsets (that fits into one register) 16154 SDValue Offset = N->getOperand(5); 16155 16156 // For "scalar + vector of indices", just scale the indices. This only 16157 // applies to non-temporal scatters because there's no instruction that takes 16158 // indicies. 16159 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { 16160 Offset = 16161 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); 16162 Opcode = AArch64ISD::SSTNT1_PRED; 16163 } 16164 16165 // In the case of non-temporal gather loads there's only one SVE instruction 16166 // per data-size: "scalar + vector", i.e. 16167 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] 16168 // Since we do have intrinsics that allow the arguments to be in a different 16169 // order, we may need to swap them to match the spec. 16170 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector()) 16171 std::swap(Base, Offset); 16172 16173 // SST1_IMM requires that the offset is an immediate that is: 16174 // * a multiple of #SizeInBytes, 16175 // * in the range [0, 31 x #SizeInBytes], 16176 // where #SizeInBytes is the size in bytes of the stored items. For 16177 // immediates outside that range and non-immediate scalar offsets use SST1 or 16178 // SST1_UXTW instead. 16179 if (Opcode == AArch64ISD::SST1_IMM_PRED) { 16180 if (!isValidImmForSVEVecImmAddrMode(Offset, 16181 SrcVT.getScalarSizeInBits() / 8)) { 16182 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) 16183 Opcode = AArch64ISD::SST1_UXTW_PRED; 16184 else 16185 Opcode = AArch64ISD::SST1_PRED; 16186 16187 std::swap(Base, Offset); 16188 } 16189 } 16190 16191 auto &TLI = DAG.getTargetLoweringInfo(); 16192 if (!TLI.isTypeLegal(Base.getValueType())) 16193 return SDValue(); 16194 16195 // Some scatter store variants allow unpacked offsets, but only as nxv2i32 16196 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to 16197 // nxv2i64. Legalize accordingly. 
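  // For example (illustrative, based on the sxtw/uxtw scatter forms this path
  // feeds): an nxv2i32 offset coming from an unpacked ACLE intrinsic is
  // widened here with ANY_EXTEND to nxv2i64; the selected sxtw/uxtw addressing
  // form only consumes the low 32 bits of each element, so the particular
  // extension used does not matter.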
16198 if (!OnlyPackedOffsets && 16199 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) 16200 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); 16201 16202 if (!TLI.isTypeLegal(Offset.getValueType())) 16203 return SDValue(); 16204 16205 // Source value type that is representable in hardware 16206 EVT HwSrcVt = getSVEContainerType(SrcVT); 16207 16208 // Keep the original type of the input data to store - this is needed to be 16209 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For 16210 // FP values we want the integer equivalent, so just use HwSrcVt. 16211 SDValue InputVT = DAG.getValueType(SrcVT); 16212 if (SrcVT.isFloatingPoint()) 16213 InputVT = DAG.getValueType(HwSrcVt); 16214 16215 SDVTList VTs = DAG.getVTList(MVT::Other); 16216 SDValue SrcNew; 16217 16218 if (Src.getValueType().isFloatingPoint()) 16219 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src); 16220 else 16221 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src); 16222 16223 SDValue Ops[] = {N->getOperand(0), // Chain 16224 SrcNew, 16225 N->getOperand(3), // Pg 16226 Base, 16227 Offset, 16228 InputVT}; 16229 16230 return DAG.getNode(Opcode, DL, VTs, Ops); 16231 } 16232 16233 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, 16234 unsigned Opcode, 16235 bool OnlyPackedOffsets = true) { 16236 const EVT RetVT = N->getValueType(0); 16237 assert(RetVT.isScalableVector() && 16238 "Gather loads are only possible for SVE vectors"); 16239 16240 SDLoc DL(N); 16241 16242 // Make sure that the loaded data will fit into an SVE register 16243 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) 16244 return SDValue(); 16245 16246 // Depending on the addressing mode, this is either a pointer or a vector of 16247 // pointers (that fits into one register) 16248 SDValue Base = N->getOperand(3); 16249 // Depending on the addressing mode, this is either a single offset or a 16250 // vector of offsets (that fits into one register) 16251 SDValue Offset = N->getOperand(4); 16252 16253 // For "scalar + vector of indices", just scale the indices. This only 16254 // applies to non-temporal gathers because there's no instruction that takes 16255 // indicies. 16256 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { 16257 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, 16258 RetVT.getScalarSizeInBits()); 16259 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; 16260 } 16261 16262 // In the case of non-temporal gather loads there's only one SVE instruction 16263 // per data-size: "scalar + vector", i.e. 16264 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] 16265 // Since we do have intrinsics that allow the arguments to be in a different 16266 // order, we may need to swap them to match the spec. 16267 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && 16268 Offset.getValueType().isVector()) 16269 std::swap(Base, Offset); 16270 16271 // GLD{FF}1_IMM requires that the offset is an immediate that is: 16272 // * a multiple of #SizeInBytes, 16273 // * in the range [0, 31 x #SizeInBytes], 16274 // where #SizeInBytes is the size in bytes of the loaded items. For 16275 // immediates outside that range and non-immediate scalar offsets use 16276 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. 
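  // Worked example (illustrative): for nxv4i32 data, #SizeInBytes == 4, so the
  // valid immediates are 0, 4, 8, ..., 124. An offset of 6 (not a multiple of
  // 4) or 128 (greater than 124) takes the register-offset fallback below,
  // which also swaps Base and Offset to match those nodes.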
16277 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || 16278 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { 16279 if (!isValidImmForSVEVecImmAddrMode(Offset, 16280 RetVT.getScalarSizeInBits() / 8)) { 16281 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) 16282 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) 16283 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO 16284 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; 16285 else 16286 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) 16287 ? AArch64ISD::GLD1_MERGE_ZERO 16288 : AArch64ISD::GLDFF1_MERGE_ZERO; 16289 16290 std::swap(Base, Offset); 16291 } 16292 } 16293 16294 auto &TLI = DAG.getTargetLoweringInfo(); 16295 if (!TLI.isTypeLegal(Base.getValueType())) 16296 return SDValue(); 16297 16298 // Some gather load variants allow unpacked offsets, but only as nxv2i32 16299 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to 16300 // nxv2i64. Legalize accordingly. 16301 if (!OnlyPackedOffsets && 16302 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) 16303 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); 16304 16305 // Return value type that is representable in hardware 16306 EVT HwRetVt = getSVEContainerType(RetVT); 16307 16308 // Keep the original output value type around - this is needed to be able to 16309 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP 16310 // values we want the integer equivalent, so just use HwRetVT. 16311 SDValue OutVT = DAG.getValueType(RetVT); 16312 if (RetVT.isFloatingPoint()) 16313 OutVT = DAG.getValueType(HwRetVt); 16314 16315 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); 16316 SDValue Ops[] = {N->getOperand(0), // Chain 16317 N->getOperand(2), // Pg 16318 Base, Offset, OutVT}; 16319 16320 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); 16321 SDValue LoadChain = SDValue(Load.getNode(), 1); 16322 16323 if (RetVT.isInteger() && (RetVT != HwRetVt)) 16324 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); 16325 16326 // If the original return value was FP, bitcast accordingly. Doing it here 16327 // means that we can avoid adding TableGen patterns for FPs. 16328 if (RetVT.isFloatingPoint()) 16329 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); 16330 16331 return DAG.getMergeValues({Load, LoadChain}, DL); 16332 } 16333 16334 static SDValue 16335 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 16336 SelectionDAG &DAG) { 16337 SDLoc DL(N); 16338 SDValue Src = N->getOperand(0); 16339 unsigned Opc = Src->getOpcode(); 16340 16341 // Sign extend of an unsigned unpack -> signed unpack 16342 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 16343 16344 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? 
AArch64ISD::SUNPKHI 16345 : AArch64ISD::SUNPKLO; 16346 16347 // Push the sign extend to the operand of the unpack 16348 // This is necessary where, for example, the operand of the unpack 16349 // is another unpack: 16350 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) 16351 // -> 16352 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) 16353 // -> 16354 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) 16355 SDValue ExtOp = Src->getOperand(0); 16356 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT(); 16357 EVT EltTy = VT.getVectorElementType(); 16358 (void)EltTy; 16359 16360 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && 16361 "Sign extending from an invalid type"); 16362 16363 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); 16364 16365 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), 16366 ExtOp, DAG.getValueType(ExtVT)); 16367 16368 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); 16369 } 16370 16371 if (DCI.isBeforeLegalizeOps()) 16372 return SDValue(); 16373 16374 if (!EnableCombineMGatherIntrinsics) 16375 return SDValue(); 16376 16377 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates 16378 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. 16379 unsigned NewOpc; 16380 unsigned MemVTOpNum = 4; 16381 switch (Opc) { 16382 case AArch64ISD::LD1_MERGE_ZERO: 16383 NewOpc = AArch64ISD::LD1S_MERGE_ZERO; 16384 MemVTOpNum = 3; 16385 break; 16386 case AArch64ISD::LDNF1_MERGE_ZERO: 16387 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; 16388 MemVTOpNum = 3; 16389 break; 16390 case AArch64ISD::LDFF1_MERGE_ZERO: 16391 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; 16392 MemVTOpNum = 3; 16393 break; 16394 case AArch64ISD::GLD1_MERGE_ZERO: 16395 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; 16396 break; 16397 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 16398 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 16399 break; 16400 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 16401 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 16402 break; 16403 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 16404 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 16405 break; 16406 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 16407 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 16408 break; 16409 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 16410 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 16411 break; 16412 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 16413 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; 16414 break; 16415 case AArch64ISD::GLDFF1_MERGE_ZERO: 16416 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; 16417 break; 16418 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 16419 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; 16420 break; 16421 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 16422 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; 16423 break; 16424 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 16425 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; 16426 break; 16427 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 16428 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; 16429 break; 16430 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 16431 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; 16432 break; 16433 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 16434 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; 16435 break; 16436 case AArch64ISD::GLDNT1_MERGE_ZERO: 16437 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; 16438 break; 16439 default: 16440 return SDValue(); 16441 } 16442 16443 EVT SignExtSrcVT = 
cast<VTSDNode>(N->getOperand(1))->getVT(); 16444 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); 16445 16446 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) 16447 return SDValue(); 16448 16449 EVT DstVT = N->getValueType(0); 16450 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); 16451 16452 SmallVector<SDValue, 5> Ops; 16453 for (unsigned I = 0; I < Src->getNumOperands(); ++I) 16454 Ops.push_back(Src->getOperand(I)); 16455 16456 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); 16457 DCI.CombineTo(N, ExtLoad); 16458 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1)); 16459 16460 // Return N so it doesn't get rechecked 16461 return SDValue(N, 0); 16462 } 16463 16464 /// Legalize the gather prefetch (scalar + vector addressing mode) when the 16465 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset 16466 /// != nxv2i32) do not need legalization. 16467 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { 16468 const unsigned OffsetPos = 4; 16469 SDValue Offset = N->getOperand(OffsetPos); 16470 16471 // Not an unpacked vector, bail out. 16472 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) 16473 return SDValue(); 16474 16475 // Extend the unpacked offset vector to 64-bit lanes. 16476 SDLoc DL(N); 16477 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); 16478 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 16479 // Replace the offset operand with the 64-bit one. 16480 Ops[OffsetPos] = Offset; 16481 16482 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 16483 } 16484 16485 /// Combines a node carrying the intrinsic 16486 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses 16487 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to 16488 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the 16489 /// sve gather prefetch instruction with vector plus immediate addressing mode. 16490 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, 16491 unsigned ScalarSizeInBytes) { 16492 const unsigned ImmPos = 4, OffsetPos = 3; 16493 // No need to combine the node if the immediate is valid... 16494 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) 16495 return SDValue(); 16496 16497 // ...otherwise swap the offset base with the offset... 16498 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 16499 std::swap(Ops[ImmPos], Ops[OffsetPos]); 16500 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to 16501 // `aarch64_sve_prfb_gather_uxtw_index`. 16502 SDLoc DL(N); 16503 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, 16504 MVT::i64); 16505 16506 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 16507 } 16508 16509 // Return true if the vector operation can guarantee only the first lane of its 16510 // result contains data, with all bits in other lanes set to zero. 
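// For example, AArch64ISD::UADDV_PRED (listed below) is modelled as producing
// its scalar result in lane 0 with every other lane zero, which is exactly
// what removeRedundantInsertVectorElt relies on.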
16511 static bool isLanes1toNKnownZero(SDValue Op) { 16512 switch (Op.getOpcode()) { 16513 default: 16514 return false; 16515 case AArch64ISD::ANDV_PRED: 16516 case AArch64ISD::EORV_PRED: 16517 case AArch64ISD::FADDA_PRED: 16518 case AArch64ISD::FADDV_PRED: 16519 case AArch64ISD::FMAXNMV_PRED: 16520 case AArch64ISD::FMAXV_PRED: 16521 case AArch64ISD::FMINNMV_PRED: 16522 case AArch64ISD::FMINV_PRED: 16523 case AArch64ISD::ORV_PRED: 16524 case AArch64ISD::SADDV_PRED: 16525 case AArch64ISD::SMAXV_PRED: 16526 case AArch64ISD::SMINV_PRED: 16527 case AArch64ISD::UADDV_PRED: 16528 case AArch64ISD::UMAXV_PRED: 16529 case AArch64ISD::UMINV_PRED: 16530 return true; 16531 } 16532 } 16533 16534 static SDValue removeRedundantInsertVectorElt(SDNode *N) { 16535 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); 16536 SDValue InsertVec = N->getOperand(0); 16537 SDValue InsertElt = N->getOperand(1); 16538 SDValue InsertIdx = N->getOperand(2); 16539 16540 // We only care about inserts into the first element... 16541 if (!isNullConstant(InsertIdx)) 16542 return SDValue(); 16543 // ...of a zero'd vector... 16544 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode())) 16545 return SDValue(); 16546 // ...where the inserted data was previously extracted... 16547 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 16548 return SDValue(); 16549 16550 SDValue ExtractVec = InsertElt.getOperand(0); 16551 SDValue ExtractIdx = InsertElt.getOperand(1); 16552 16553 // ...from the first element of a vector. 16554 if (!isNullConstant(ExtractIdx)) 16555 return SDValue(); 16556 16557 // If we get here we are effectively trying to zero lanes 1-N of a vector. 16558 16559 // Ensure there's no type conversion going on. 16560 if (N->getValueType(0) != ExtractVec.getValueType()) 16561 return SDValue(); 16562 16563 if (!isLanes1toNKnownZero(ExtractVec)) 16564 return SDValue(); 16565 16566 // The explicit zeroing is redundant. 
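// In other words, INSERT_VECTOR_ELT(zero-splat, EXTRACT_VECTOR_ELT(V, 0), 0)
// can simply be folded to V when lanes 1-N of V are already known to be zero.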
16567 return ExtractVec; 16568 } 16569 16570 static SDValue 16571 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 16572 if (SDValue Res = removeRedundantInsertVectorElt(N)) 16573 return Res; 16574 16575 return performPostLD1Combine(N, DCI, true); 16576 } 16577 16578 SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { 16579 EVT Ty = N->getValueType(0); 16580 if (Ty.isInteger()) 16581 return SDValue(); 16582 16583 EVT IntTy = Ty.changeVectorElementTypeToInteger(); 16584 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount()); 16585 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() < 16586 IntTy.getVectorElementType().getScalarSizeInBits()) 16587 return SDValue(); 16588 16589 SDLoc DL(N); 16590 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)), 16591 DL, ExtIntTy); 16592 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)), 16593 DL, ExtIntTy); 16594 SDValue Idx = N->getOperand(2); 16595 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx); 16596 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy); 16597 return DAG.getBitcast(Ty, Trunc); 16598 } 16599 16600 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 16601 DAGCombinerInfo &DCI) const { 16602 SelectionDAG &DAG = DCI.DAG; 16603 switch (N->getOpcode()) { 16604 default: 16605 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); 16606 break; 16607 case ISD::ADD: 16608 case ISD::SUB: 16609 return performAddSubCombine(N, DCI, DAG); 16610 case ISD::XOR: 16611 return performXorCombine(N, DAG, DCI, Subtarget); 16612 case ISD::MUL: 16613 return performMulCombine(N, DAG, DCI, Subtarget); 16614 case ISD::SINT_TO_FP: 16615 case ISD::UINT_TO_FP: 16616 return performIntToFpCombine(N, DAG, Subtarget); 16617 case ISD::FP_TO_SINT: 16618 case ISD::FP_TO_UINT: 16619 return performFpToIntCombine(N, DAG, DCI, Subtarget); 16620 case ISD::FDIV: 16621 return performFDivCombine(N, DAG, DCI, Subtarget); 16622 case ISD::OR: 16623 return performORCombine(N, DCI, Subtarget); 16624 case ISD::AND: 16625 return performANDCombine(N, DCI); 16626 case ISD::SRL: 16627 return performSRLCombine(N, DCI); 16628 case ISD::INTRINSIC_WO_CHAIN: 16629 return performIntrinsicCombine(N, DCI, Subtarget); 16630 case ISD::ANY_EXTEND: 16631 case ISD::ZERO_EXTEND: 16632 case ISD::SIGN_EXTEND: 16633 return performExtendCombine(N, DCI, DAG); 16634 case ISD::SIGN_EXTEND_INREG: 16635 return performSignExtendInRegCombine(N, DCI, DAG); 16636 case ISD::TRUNCATE: 16637 return performVectorTruncateCombine(N, DCI, DAG); 16638 case ISD::CONCAT_VECTORS: 16639 return performConcatVectorsCombine(N, DCI, DAG); 16640 case ISD::SELECT: 16641 return performSelectCombine(N, DCI); 16642 case ISD::VSELECT: 16643 return performVSelectCombine(N, DCI.DAG); 16644 case ISD::SETCC: 16645 return performSETCCCombine(N, DAG); 16646 case ISD::LOAD: 16647 if (performTBISimplification(N->getOperand(1), DCI, DAG)) 16648 return SDValue(N, 0); 16649 break; 16650 case ISD::STORE: 16651 return performSTORECombine(N, DCI, DAG, Subtarget); 16652 case ISD::VECTOR_SPLICE: 16653 return performSVESpliceCombine(N, DAG); 16654 case AArch64ISD::BRCOND: 16655 return performBRCONDCombine(N, DCI, DAG); 16656 case AArch64ISD::TBNZ: 16657 case AArch64ISD::TBZ: 16658 return performTBZCombine(N, DCI, DAG); 16659 case AArch64ISD::CSEL: 16660 return performCSELCombine(N, DCI, DAG); 16661 case AArch64ISD::DUP: 16662 return performPostLD1Combine(N, DCI, false); 16663 case AArch64ISD::NVCAST: 16664 
return performNVCASTCombine(N); 16665 case AArch64ISD::SPLICE: 16666 return performSpliceCombine(N, DAG); 16667 case AArch64ISD::UZP1: 16668 return performUzpCombine(N, DAG); 16669 case AArch64ISD::SETCC_MERGE_ZERO: 16670 return performSetccMergeZeroCombine(N, DAG); 16671 case AArch64ISD::GLD1_MERGE_ZERO: 16672 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 16673 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 16674 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 16675 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 16676 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 16677 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 16678 case AArch64ISD::GLD1S_MERGE_ZERO: 16679 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO: 16680 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO: 16681 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO: 16682 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO: 16683 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO: 16684 case AArch64ISD::GLD1S_IMM_MERGE_ZERO: 16685 return performGLD1Combine(N, DAG); 16686 case AArch64ISD::VASHR: 16687 case AArch64ISD::VLSHR: 16688 return performVectorShiftCombine(N, *this, DCI); 16689 case ISD::INSERT_VECTOR_ELT: 16690 return performInsertVectorEltCombine(N, DCI); 16691 case ISD::EXTRACT_VECTOR_ELT: 16692 return performExtractVectorEltCombine(N, DAG); 16693 case ISD::VECREDUCE_ADD: 16694 return performVecReduceAddCombine(N, DCI.DAG, Subtarget); 16695 case ISD::INTRINSIC_VOID: 16696 case ISD::INTRINSIC_W_CHAIN: 16697 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 16698 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: 16699 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); 16700 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: 16701 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); 16702 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: 16703 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); 16704 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: 16705 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); 16706 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: 16707 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: 16708 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: 16709 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: 16710 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: 16711 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: 16712 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: 16713 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: 16714 return legalizeSVEGatherPrefetchOffsVec(N, DAG); 16715 case Intrinsic::aarch64_neon_ld2: 16716 case Intrinsic::aarch64_neon_ld3: 16717 case Intrinsic::aarch64_neon_ld4: 16718 case Intrinsic::aarch64_neon_ld1x2: 16719 case Intrinsic::aarch64_neon_ld1x3: 16720 case Intrinsic::aarch64_neon_ld1x4: 16721 case Intrinsic::aarch64_neon_ld2lane: 16722 case Intrinsic::aarch64_neon_ld3lane: 16723 case Intrinsic::aarch64_neon_ld4lane: 16724 case Intrinsic::aarch64_neon_ld2r: 16725 case Intrinsic::aarch64_neon_ld3r: 16726 case Intrinsic::aarch64_neon_ld4r: 16727 case Intrinsic::aarch64_neon_st2: 16728 case Intrinsic::aarch64_neon_st3: 16729 case Intrinsic::aarch64_neon_st4: 16730 case Intrinsic::aarch64_neon_st1x2: 16731 case Intrinsic::aarch64_neon_st1x3: 16732 case Intrinsic::aarch64_neon_st1x4: 16733 case Intrinsic::aarch64_neon_st2lane: 16734 case Intrinsic::aarch64_neon_st3lane: 16735 case Intrinsic::aarch64_neon_st4lane: 16736 return performNEONPostLDSTCombine(N, DCI, DAG); 16737 case Intrinsic::aarch64_sve_ldnt1: 16738 
return performLDNT1Combine(N, DAG); 16739 case Intrinsic::aarch64_sve_ld1rq: 16740 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); 16741 case Intrinsic::aarch64_sve_ld1ro: 16742 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); 16743 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 16744 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 16745 case Intrinsic::aarch64_sve_ldnt1_gather: 16746 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 16747 case Intrinsic::aarch64_sve_ldnt1_gather_index: 16748 return performGatherLoadCombine(N, DAG, 16749 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); 16750 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 16751 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 16752 case Intrinsic::aarch64_sve_ld1: 16753 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); 16754 case Intrinsic::aarch64_sve_ldnf1: 16755 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); 16756 case Intrinsic::aarch64_sve_ldff1: 16757 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); 16758 case Intrinsic::aarch64_sve_st1: 16759 return performST1Combine(N, DAG); 16760 case Intrinsic::aarch64_sve_stnt1: 16761 return performSTNT1Combine(N, DAG); 16762 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 16763 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 16764 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 16765 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 16766 case Intrinsic::aarch64_sve_stnt1_scatter: 16767 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 16768 case Intrinsic::aarch64_sve_stnt1_scatter_index: 16769 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); 16770 case Intrinsic::aarch64_sve_ld1_gather: 16771 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); 16772 case Intrinsic::aarch64_sve_ld1_gather_index: 16773 return performGatherLoadCombine(N, DAG, 16774 AArch64ISD::GLD1_SCALED_MERGE_ZERO); 16775 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 16776 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, 16777 /*OnlyPackedOffsets=*/false); 16778 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 16779 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, 16780 /*OnlyPackedOffsets=*/false); 16781 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 16782 return performGatherLoadCombine(N, DAG, 16783 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, 16784 /*OnlyPackedOffsets=*/false); 16785 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 16786 return performGatherLoadCombine(N, DAG, 16787 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, 16788 /*OnlyPackedOffsets=*/false); 16789 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 16790 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); 16791 case Intrinsic::aarch64_sve_ldff1_gather: 16792 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); 16793 case Intrinsic::aarch64_sve_ldff1_gather_index: 16794 return performGatherLoadCombine(N, DAG, 16795 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); 16796 case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 16797 return performGatherLoadCombine(N, DAG, 16798 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, 16799 /*OnlyPackedOffsets=*/false); 16800 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 16801 return performGatherLoadCombine(N, DAG, 16802 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, 16803 
/*OnlyPackedOffsets=*/false); 16804 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 16805 return performGatherLoadCombine(N, DAG, 16806 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, 16807 /*OnlyPackedOffsets=*/false); 16808 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 16809 return performGatherLoadCombine(N, DAG, 16810 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, 16811 /*OnlyPackedOffsets=*/false); 16812 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 16813 return performGatherLoadCombine(N, DAG, 16814 AArch64ISD::GLDFF1_IMM_MERGE_ZERO); 16815 case Intrinsic::aarch64_sve_st1_scatter: 16816 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); 16817 case Intrinsic::aarch64_sve_st1_scatter_index: 16818 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); 16819 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 16820 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, 16821 /*OnlyPackedOffsets=*/false); 16822 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 16823 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, 16824 /*OnlyPackedOffsets=*/false); 16825 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 16826 return performScatterStoreCombine(N, DAG, 16827 AArch64ISD::SST1_SXTW_SCALED_PRED, 16828 /*OnlyPackedOffsets=*/false); 16829 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 16830 return performScatterStoreCombine(N, DAG, 16831 AArch64ISD::SST1_UXTW_SCALED_PRED, 16832 /*OnlyPackedOffsets=*/false); 16833 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 16834 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); 16835 case Intrinsic::aarch64_sve_tuple_get: { 16836 SDLoc DL(N); 16837 SDValue Chain = N->getOperand(0); 16838 SDValue Src1 = N->getOperand(2); 16839 SDValue Idx = N->getOperand(3); 16840 16841 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); 16842 EVT ResVT = N->getValueType(0); 16843 uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue(); 16844 SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL); 16845 SDValue Val = 16846 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx); 16847 return DAG.getMergeValues({Val, Chain}, DL); 16848 } 16849 case Intrinsic::aarch64_sve_tuple_set: { 16850 SDLoc DL(N); 16851 SDValue Chain = N->getOperand(0); 16852 SDValue Tuple = N->getOperand(2); 16853 SDValue Idx = N->getOperand(3); 16854 SDValue Vec = N->getOperand(4); 16855 16856 EVT TupleVT = Tuple.getValueType(); 16857 uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue(); 16858 16859 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); 16860 uint64_t NumLanes = 16861 Vec.getValueType().getVectorElementCount().getKnownMinValue(); 16862 16863 if ((TupleLanes % NumLanes) != 0) 16864 report_fatal_error("invalid tuple vector!"); 16865 16866 uint64_t NumVecs = TupleLanes / NumLanes; 16867 16868 SmallVector<SDValue, 4> Opnds; 16869 for (unsigned I = 0; I < NumVecs; ++I) { 16870 if (I == IdxConst) 16871 Opnds.push_back(Vec); 16872 else { 16873 SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL); 16874 Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 16875 Vec.getValueType(), Tuple, ExtIdx)); 16876 } 16877 } 16878 SDValue Concat = 16879 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); 16880 return DAG.getMergeValues({Concat, Chain}, DL); 16881 } 16882 case Intrinsic::aarch64_sve_tuple_create2: 16883 case Intrinsic::aarch64_sve_tuple_create3: 16884 case Intrinsic::aarch64_sve_tuple_create4: 
{ 16885 SDLoc DL(N); 16886 SDValue Chain = N->getOperand(0); 16887 16888 SmallVector<SDValue, 4> Opnds; 16889 for (unsigned I = 2; I < N->getNumOperands(); ++I) 16890 Opnds.push_back(N->getOperand(I)); 16891 16892 EVT VT = Opnds[0].getValueType(); 16893 EVT EltVT = VT.getVectorElementType(); 16894 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 16895 VT.getVectorElementCount() * 16896 (N->getNumOperands() - 2)); 16897 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); 16898 return DAG.getMergeValues({Concat, Chain}, DL); 16899 } 16900 case Intrinsic::aarch64_sve_ld2: 16901 case Intrinsic::aarch64_sve_ld3: 16902 case Intrinsic::aarch64_sve_ld4: { 16903 SDLoc DL(N); 16904 SDValue Chain = N->getOperand(0); 16905 SDValue Mask = N->getOperand(2); 16906 SDValue BasePtr = N->getOperand(3); 16907 SDValue LoadOps[] = {Chain, Mask, BasePtr}; 16908 unsigned IntrinsicID = 16909 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 16910 SDValue Result = 16911 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL); 16912 return DAG.getMergeValues({Result, Chain}, DL); 16913 } 16914 case Intrinsic::aarch64_rndr: 16915 case Intrinsic::aarch64_rndrrs: { 16916 unsigned IntrinsicID = 16917 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 16918 auto Register = 16919 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR 16920 : AArch64SysReg::RNDRRS); 16921 SDLoc DL(N); 16922 SDValue A = DAG.getNode( 16923 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), 16924 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64)); 16925 SDValue B = DAG.getNode( 16926 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), 16927 DAG.getConstant(0, DL, MVT::i32), 16928 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); 16929 return DAG.getMergeValues( 16930 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); 16931 } 16932 default: 16933 break; 16934 } 16935 break; 16936 case ISD::GlobalAddress: 16937 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); 16938 } 16939 return SDValue(); 16940 } 16941 16942 // Check if the return value is used as only a return value, as otherwise 16943 // we can't perform a tail-call. In particular, we need to check for 16944 // target ISD nodes that are returns and any other "odd" constructs 16945 // that the generic analysis code won't necessarily catch. 16946 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 16947 SDValue &Chain) const { 16948 if (N->getNumValues() != 1) 16949 return false; 16950 if (!N->hasNUsesOfValue(1, 0)) 16951 return false; 16952 16953 SDValue TCChain = Chain; 16954 SDNode *Copy = *N->use_begin(); 16955 if (Copy->getOpcode() == ISD::CopyToReg) { 16956 // If the copy has a glue operand, we conservatively assume it isn't safe to 16957 // perform a tail call. 16958 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 16959 MVT::Glue) 16960 return false; 16961 TCChain = Copy->getOperand(0); 16962 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 16963 return false; 16964 16965 bool HasRet = false; 16966 for (SDNode *Node : Copy->uses()) { 16967 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 16968 return false; 16969 HasRet = true; 16970 } 16971 16972 if (!HasRet) 16973 return false; 16974 16975 Chain = TCChain; 16976 return true; 16977 } 16978 16979 // Return whether the an instruction can potentially be optimized to a tail 16980 // call. 
This will cause the optimizers to attempt to move, or duplicate, 16981 // return instructions to help enable tail call optimizations for this 16982 // instruction. 16983 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 16984 return CI->isTailCall(); 16985 } 16986 16987 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 16988 SDValue &Offset, 16989 ISD::MemIndexedMode &AM, 16990 bool &IsInc, 16991 SelectionDAG &DAG) const { 16992 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 16993 return false; 16994 16995 Base = Op->getOperand(0); 16996 // All of the indexed addressing mode instructions take a signed 16997 // 9 bit immediate offset. 16998 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 16999 int64_t RHSC = RHS->getSExtValue(); 17000 if (Op->getOpcode() == ISD::SUB) 17001 RHSC = -(uint64_t)RHSC; 17002 if (!isInt<9>(RHSC)) 17003 return false; 17004 IsInc = (Op->getOpcode() == ISD::ADD); 17005 Offset = Op->getOperand(1); 17006 return true; 17007 } 17008 return false; 17009 } 17010 17011 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 17012 SDValue &Offset, 17013 ISD::MemIndexedMode &AM, 17014 SelectionDAG &DAG) const { 17015 EVT VT; 17016 SDValue Ptr; 17017 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 17018 VT = LD->getMemoryVT(); 17019 Ptr = LD->getBasePtr(); 17020 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 17021 VT = ST->getMemoryVT(); 17022 Ptr = ST->getBasePtr(); 17023 } else 17024 return false; 17025 17026 bool IsInc; 17027 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 17028 return false; 17029 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; 17030 return true; 17031 } 17032 17033 bool AArch64TargetLowering::getPostIndexedAddressParts( 17034 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 17035 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 17036 EVT VT; 17037 SDValue Ptr; 17038 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 17039 VT = LD->getMemoryVT(); 17040 Ptr = LD->getBasePtr(); 17041 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 17042 VT = ST->getMemoryVT(); 17043 Ptr = ST->getBasePtr(); 17044 } else 17045 return false; 17046 17047 bool IsInc; 17048 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 17049 return false; 17050 // Post-indexing updates the base, so it's not a valid transform 17051 // if that's not the same as the load's pointer. 17052 if (Ptr != Base) 17053 return false; 17054 AM = IsInc ? 
ISD::POST_INC : ISD::POST_DEC; 17055 return true; 17056 } 17057 17058 void AArch64TargetLowering::ReplaceBITCASTResults( 17059 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 17060 SDLoc DL(N); 17061 SDValue Op = N->getOperand(0); 17062 EVT VT = N->getValueType(0); 17063 EVT SrcVT = Op.getValueType(); 17064 17065 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) { 17066 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && 17067 "Expected fp->int bitcast!"); 17068 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG); 17069 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult)); 17070 return; 17071 } 17072 17073 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) 17074 return; 17075 17076 Op = SDValue( 17077 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 17078 DAG.getUNDEF(MVT::i32), Op, 17079 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 17080 0); 17081 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 17082 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 17083 } 17084 17085 static void ReplaceReductionResults(SDNode *N, 17086 SmallVectorImpl<SDValue> &Results, 17087 SelectionDAG &DAG, unsigned InterOp, 17088 unsigned AcrossOp) { 17089 EVT LoVT, HiVT; 17090 SDValue Lo, Hi; 17091 SDLoc dl(N); 17092 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 17093 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 17094 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 17095 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 17096 Results.push_back(SplitVal); 17097 } 17098 17099 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { 17100 SDLoc DL(N); 17101 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); 17102 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, 17103 DAG.getNode(ISD::SRL, DL, MVT::i128, N, 17104 DAG.getConstant(64, DL, MVT::i64))); 17105 return std::make_pair(Lo, Hi); 17106 } 17107 17108 void AArch64TargetLowering::ReplaceExtractSubVectorResults( 17109 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 17110 SDValue In = N->getOperand(0); 17111 EVT InVT = In.getValueType(); 17112 17113 // Common code will handle these just fine. 17114 if (!InVT.isScalableVector() || !InVT.isInteger()) 17115 return; 17116 17117 SDLoc DL(N); 17118 EVT VT = N->getValueType(0); 17119 17120 // The following checks bail if this is not a halving operation. 17121 17122 ElementCount ResEC = VT.getVectorElementCount(); 17123 17124 if (InVT.getVectorElementCount() != (ResEC * 2)) 17125 return; 17126 17127 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); 17128 if (!CIndex) 17129 return; 17130 17131 unsigned Index = CIndex->getZExtValue(); 17132 if ((Index != 0) && (Index != ResEC.getKnownMinValue())) 17133 return; 17134 17135 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; 17136 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); 17137 17138 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); 17139 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); 17140 } 17141 17142 // Create an even/odd pair of X registers holding integer value V. 
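// The pair is built as a REG_SEQUENCE in the XSeqPairsClass register class so
// that the CASP family of instructions (used below when LSE is available) can
// consume it directly; on big-endian targets the two halves are swapped first.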
17143 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 17144 SDLoc dl(V.getNode()); 17145 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); 17146 SDValue VHi = DAG.getAnyExtOrTrunc( 17147 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), 17148 dl, MVT::i64); 17149 if (DAG.getDataLayout().isBigEndian()) 17150 std::swap (VLo, VHi); 17151 SDValue RegClass = 17152 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); 17153 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); 17154 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); 17155 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 17156 return SDValue( 17157 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 17158 } 17159 17160 static void ReplaceCMP_SWAP_128Results(SDNode *N, 17161 SmallVectorImpl<SDValue> &Results, 17162 SelectionDAG &DAG, 17163 const AArch64Subtarget *Subtarget) { 17164 assert(N->getValueType(0) == MVT::i128 && 17165 "AtomicCmpSwap on types less than 128 should be legal"); 17166 17167 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 17168 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { 17169 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, 17170 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. 17171 SDValue Ops[] = { 17172 createGPRPairNode(DAG, N->getOperand(2)), // Compare value 17173 createGPRPairNode(DAG, N->getOperand(3)), // Store value 17174 N->getOperand(1), // Ptr 17175 N->getOperand(0), // Chain in 17176 }; 17177 17178 unsigned Opcode; 17179 switch (MemOp->getMergedOrdering()) { 17180 case AtomicOrdering::Monotonic: 17181 Opcode = AArch64::CASPX; 17182 break; 17183 case AtomicOrdering::Acquire: 17184 Opcode = AArch64::CASPAX; 17185 break; 17186 case AtomicOrdering::Release: 17187 Opcode = AArch64::CASPLX; 17188 break; 17189 case AtomicOrdering::AcquireRelease: 17190 case AtomicOrdering::SequentiallyConsistent: 17191 Opcode = AArch64::CASPALX; 17192 break; 17193 default: 17194 llvm_unreachable("Unexpected ordering!"); 17195 } 17196 17197 MachineSDNode *CmpSwap = DAG.getMachineNode( 17198 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); 17199 DAG.setNodeMemRefs(CmpSwap, {MemOp}); 17200 17201 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; 17202 if (DAG.getDataLayout().isBigEndian()) 17203 std::swap(SubReg1, SubReg2); 17204 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, 17205 SDValue(CmpSwap, 0)); 17206 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, 17207 SDValue(CmpSwap, 0)); 17208 Results.push_back( 17209 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); 17210 Results.push_back(SDValue(CmpSwap, 1)); // Chain out 17211 return; 17212 } 17213 17214 unsigned Opcode; 17215 switch (MemOp->getMergedOrdering()) { 17216 case AtomicOrdering::Monotonic: 17217 Opcode = AArch64::CMP_SWAP_128_MONOTONIC; 17218 break; 17219 case AtomicOrdering::Acquire: 17220 Opcode = AArch64::CMP_SWAP_128_ACQUIRE; 17221 break; 17222 case AtomicOrdering::Release: 17223 Opcode = AArch64::CMP_SWAP_128_RELEASE; 17224 break; 17225 case AtomicOrdering::AcquireRelease: 17226 case AtomicOrdering::SequentiallyConsistent: 17227 Opcode = AArch64::CMP_SWAP_128; 17228 break; 17229 default: 17230 llvm_unreachable("Unexpected ordering!"); 17231 } 17232 17233 auto Desired = splitInt128(N->getOperand(2), DAG); 17234 auto New = splitInt128(N->getOperand(3), DAG); 17235 
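// Operand order for the CMP_SWAP_128* pseudo built below: pointer, desired
// value (lo/hi), new value (lo/hi), and finally the incoming chain.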
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, 17236 New.first, New.second, N->getOperand(0)}; 17237 SDNode *CmpSwap = DAG.getMachineNode( 17238 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), 17239 Ops); 17240 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 17241 17242 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 17243 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); 17244 Results.push_back(SDValue(CmpSwap, 3)); 17245 } 17246 17247 void AArch64TargetLowering::ReplaceNodeResults( 17248 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 17249 switch (N->getOpcode()) { 17250 default: 17251 llvm_unreachable("Don't know how to custom expand this"); 17252 case ISD::BITCAST: 17253 ReplaceBITCASTResults(N, Results, DAG); 17254 return; 17255 case ISD::VECREDUCE_ADD: 17256 case ISD::VECREDUCE_SMAX: 17257 case ISD::VECREDUCE_SMIN: 17258 case ISD::VECREDUCE_UMAX: 17259 case ISD::VECREDUCE_UMIN: 17260 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); 17261 return; 17262 17263 case ISD::CTPOP: 17264 if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) 17265 Results.push_back(Result); 17266 return; 17267 case AArch64ISD::SADDV: 17268 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 17269 return; 17270 case AArch64ISD::UADDV: 17271 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 17272 return; 17273 case AArch64ISD::SMINV: 17274 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); 17275 return; 17276 case AArch64ISD::UMINV: 17277 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 17278 return; 17279 case AArch64ISD::SMAXV: 17280 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 17281 return; 17282 case AArch64ISD::UMAXV: 17283 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 17284 return; 17285 case ISD::FP_TO_UINT: 17286 case ISD::FP_TO_SINT: 17287 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 17288 // Let normal code take care of it by not adding anything to Results. 17289 return; 17290 case ISD::ATOMIC_CMP_SWAP: 17291 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); 17292 return; 17293 case ISD::LOAD: { 17294 assert(SDValue(N, 0).getValueType() == MVT::i128 && 17295 "unexpected load's value type"); 17296 LoadSDNode *LoadNode = cast<LoadSDNode>(N); 17297 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) { 17298 // Non-volatile loads are optimized later in AArch64's load/store 17299 // optimizer. 
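// In other words, only volatile i128 loads are turned into an LDP node here;
// everything else keeps the default handling.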
17300 return; 17301 } 17302 17303 SDValue Result = DAG.getMemIntrinsicNode( 17304 AArch64ISD::LDP, SDLoc(N), 17305 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), 17306 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), 17307 LoadNode->getMemOperand()); 17308 17309 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 17310 Result.getValue(0), Result.getValue(1)); 17311 Results.append({Pair, Result.getValue(2) /* Chain */}); 17312 return; 17313 } 17314 case ISD::EXTRACT_SUBVECTOR: 17315 ReplaceExtractSubVectorResults(N, Results, DAG); 17316 return; 17317 case ISD::INSERT_SUBVECTOR: 17318 // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate 17319 // to common code for result type legalisation 17320 return; 17321 case ISD::INTRINSIC_WO_CHAIN: { 17322 EVT VT = N->getValueType(0); 17323 assert((VT == MVT::i8 || VT == MVT::i16) && 17324 "custom lowering for unexpected type"); 17325 17326 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0)); 17327 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 17328 switch (IntID) { 17329 default: 17330 return; 17331 case Intrinsic::aarch64_sve_clasta_n: { 17332 SDLoc DL(N); 17333 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 17334 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, 17335 N->getOperand(1), Op2, N->getOperand(3)); 17336 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 17337 return; 17338 } 17339 case Intrinsic::aarch64_sve_clastb_n: { 17340 SDLoc DL(N); 17341 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 17342 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, 17343 N->getOperand(1), Op2, N->getOperand(3)); 17344 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 17345 return; 17346 } 17347 case Intrinsic::aarch64_sve_lasta: { 17348 SDLoc DL(N); 17349 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, 17350 N->getOperand(1), N->getOperand(2)); 17351 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 17352 return; 17353 } 17354 case Intrinsic::aarch64_sve_lastb: { 17355 SDLoc DL(N); 17356 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, 17357 N->getOperand(1), N->getOperand(2)); 17358 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 17359 return; 17360 } 17361 } 17362 } 17363 } 17364 } 17365 17366 bool AArch64TargetLowering::useLoadStackGuardNode() const { 17367 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) 17368 return TargetLowering::useLoadStackGuardNode(); 17369 return true; 17370 } 17371 17372 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 17373 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 17374 // reciprocal if there are three or more FDIVs. 17375 return 3; 17376 } 17377 17378 TargetLoweringBase::LegalizeTypeAction 17379 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { 17380 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 17381 // v4i16, v2i32 instead of to promote. 17382 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || 17383 VT == MVT::v1f32) 17384 return TypeWidenVector; 17385 17386 return TargetLoweringBase::getPreferredVectorAction(VT); 17387 } 17388 17389 // Loads and stores less than 128-bits are already atomic; ones above that 17390 // are doomed anyway, so defer to the default libcall and blame the OS when 17391 // things go wrong. 
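// Note that only the exactly-128-bit case returns true (and is then expanded
// by the generic AtomicExpand pass); smaller accesses stay as ordinary loads
// and stores, which the comment above already treats as atomic.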
17392 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17393 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
17394 return Size == 128;
17395 }
17396
17397 // Loads and stores less than 128-bits are already atomic; ones above that
17398 // are doomed anyway, so defer to the default libcall and blame the OS when
17399 // things go wrong.
17400 TargetLowering::AtomicExpansionKind
17401 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17402 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
17403 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
17404 }
17405
17406 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
17407 TargetLowering::AtomicExpansionKind
17408 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
17409 if (AI->isFloatingPointOperation())
17410 return AtomicExpansionKind::CmpXChg;
17411
17412 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
17413 if (Size > 128) return AtomicExpansionKind::None;
17414
17415 // Nand is not supported in LSE.
17416 // Leave 128 bits to LLSC or CmpXChg.
17417 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
17418 if (Subtarget->hasLSE())
17419 return AtomicExpansionKind::None;
17420 if (Subtarget->outlineAtomics()) {
17421 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
17422 // Don't outline them unless
17423 // (1) high level <atomic> support approved:
17424 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
17425 // (2) low level libgcc and compiler-rt support implemented by:
17426 // min/max outline atomics helpers
17427 if (AI->getOperation() != AtomicRMWInst::Min &&
17428 AI->getOperation() != AtomicRMWInst::Max &&
17429 AI->getOperation() != AtomicRMWInst::UMin &&
17430 AI->getOperation() != AtomicRMWInst::UMax) {
17431 return AtomicExpansionKind::None;
17432 }
17433 }
17434 }
17435
17436 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17437 // implement atomicrmw without spilling. If the target address is also on the
17438 // stack and close enough to the spill slot, this can lead to a situation
17439 // where the monitor always gets cleared and the atomic operation can never
17440 // succeed. So at -O0 lower this operation to a CAS loop.
17441 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17442 return AtomicExpansionKind::CmpXChg;
17443
17444 return AtomicExpansionKind::LLSC;
17445 }
17446
17447 TargetLowering::AtomicExpansionKind
17448 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
17449 AtomicCmpXchgInst *AI) const {
17450 // If the subtarget has LSE, leave cmpxchg intact for codegen.
17451 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
17452 return AtomicExpansionKind::None;
17453 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17454 // implement cmpxchg without spilling. If the address being exchanged is also
17455 // on the stack and close enough to the spill slot, this can lead to a
17456 // situation where the monitor always gets cleared and the atomic operation
17457 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
17458 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17459 return AtomicExpansionKind::None;
17460
17461 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
17462 // it.
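// Sizes above 64 bits therefore return None below and are handled later during
// DAG legalization instead (see ReplaceCMP_SWAP_128Results above: CASP when
// LSE is available, otherwise the CMP_SWAP_128* pseudos).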
17463 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); 17464 if (Size > 64) 17465 return AtomicExpansionKind::None; 17466 17467 return AtomicExpansionKind::LLSC; 17468 } 17469 17470 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, 17471 Type *ValueTy, Value *Addr, 17472 AtomicOrdering Ord) const { 17473 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 17474 bool IsAcquire = isAcquireOrStronger(Ord); 17475 17476 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 17477 // intrinsic must return {i64, i64} and we have to recombine them into a 17478 // single i128 here. 17479 if (ValueTy->getPrimitiveSizeInBits() == 128) { 17480 Intrinsic::ID Int = 17481 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 17482 Function *Ldxr = Intrinsic::getDeclaration(M, Int); 17483 17484 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 17485 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 17486 17487 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 17488 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 17489 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); 17490 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); 17491 return Builder.CreateOr( 17492 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); 17493 } 17494 17495 Type *Tys[] = { Addr->getType() }; 17496 Intrinsic::ID Int = 17497 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 17498 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); 17499 17500 const DataLayout &DL = M->getDataLayout(); 17501 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); 17502 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); 17503 17504 return Builder.CreateBitCast(Trunc, ValueTy); 17505 } 17506 17507 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 17508 IRBuilderBase &Builder) const { 17509 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 17510 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 17511 } 17512 17513 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, 17514 Value *Val, Value *Addr, 17515 AtomicOrdering Ord) const { 17516 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 17517 bool IsRelease = isReleaseOrStronger(Ord); 17518 17519 // Since the intrinsics must have legal type, the i128 intrinsics take two 17520 // parameters: "i64, i64". We must marshal Val into the appropriate form 17521 // before the call. 17522 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 17523 Intrinsic::ID Int = 17524 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 17525 Function *Stxr = Intrinsic::getDeclaration(M, Int); 17526 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 17527 17528 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 17529 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 17530 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 17531 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 17532 } 17533 17534 Intrinsic::ID Int = 17535 IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 17536 Type *Tys[] = { Addr->getType() }; 17537 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 17538 17539 const DataLayout &DL = M->getDataLayout(); 17540 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); 17541 Val = Builder.CreateBitCast(Val, IntValTy); 17542 17543 return Builder.CreateCall(Stxr, 17544 {Builder.CreateZExtOrBitCast( 17545 Val, Stxr->getFunctionType()->getParamType(0)), 17546 Addr}); 17547 } 17548 17549 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 17550 Type *Ty, CallingConv::ID CallConv, bool isVarArg, 17551 const DataLayout &DL) const { 17552 if (!Ty->isArrayTy()) { 17553 const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); 17554 return TySize.isScalable() && TySize.getKnownMinSize() > 128; 17555 } 17556 17557 // All non aggregate members of the type must have the same type 17558 SmallVector<EVT> ValueVTs; 17559 ComputeValueVTs(*this, DL, Ty, ValueVTs); 17560 return is_splat(ValueVTs); 17561 } 17562 17563 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 17564 EVT) const { 17565 return false; 17566 } 17567 17568 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { 17569 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 17570 Function *ThreadPointerFunc = 17571 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); 17572 return IRB.CreatePointerCast( 17573 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 17574 Offset), 17575 IRB.getInt8PtrTy()->getPointerTo(0)); 17576 } 17577 17578 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { 17579 // Android provides a fixed TLS slot for the stack cookie. See the definition 17580 // of TLS_SLOT_STACK_GUARD in 17581 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 17582 if (Subtarget->isTargetAndroid()) 17583 return UseTlsOffset(IRB, 0x28); 17584 17585 // Fuchsia is similar. 17586 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 17587 if (Subtarget->isTargetFuchsia()) 17588 return UseTlsOffset(IRB, -0x10); 17589 17590 return TargetLowering::getIRStackGuard(IRB); 17591 } 17592 17593 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { 17594 // MSVC CRT provides functionalities for stack protection. 17595 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { 17596 // MSVC CRT has a global variable holding security cookie. 17597 M.getOrInsertGlobal("__security_cookie", 17598 Type::getInt8PtrTy(M.getContext())); 17599 17600 // MSVC CRT has a function to validate security cookie. 17601 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 17602 "__security_check_cookie", Type::getVoidTy(M.getContext()), 17603 Type::getInt8PtrTy(M.getContext())); 17604 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { 17605 F->setCallingConv(CallingConv::Win64); 17606 F->addAttribute(1, Attribute::AttrKind::InReg); 17607 } 17608 return; 17609 } 17610 TargetLowering::insertSSPDeclarations(M); 17611 } 17612 17613 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { 17614 // MSVC CRT has a global variable holding security cookie. 
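// (This is the "__security_cookie" global registered in insertSSPDeclarations
// above.)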
17615 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17616 return M.getGlobalVariable("__security_cookie");
17617 return TargetLowering::getSDagStackGuard(M);
17618 }
17619
17620 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
17621 // MSVC CRT has a function to validate security cookie.
17622 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17623 return M.getFunction("__security_check_cookie");
17624 return TargetLowering::getSSPStackGuardCheck(M);
17625 }
17626
17627 Value *
17628 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
17629 // Android provides a fixed TLS slot for the SafeStack pointer. See the
17630 // definition of TLS_SLOT_SAFESTACK in
17631 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17632 if (Subtarget->isTargetAndroid())
17633 return UseTlsOffset(IRB, 0x48);
17634
17635 // Fuchsia is similar.
17636 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
17637 if (Subtarget->isTargetFuchsia())
17638 return UseTlsOffset(IRB, -0x8);
17639
17640 return TargetLowering::getSafeStackPointerLocation(IRB);
17641 }
17642
17643 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
17644 const Instruction &AndI) const {
17645 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
17646 // this is likely to fold the and/cmp/br into a single tbz instruction. It
17647 // may be beneficial to sink in other cases, but we would have to check that
17648 // the cmp would not get folded into the br to form a cbz for these to be
17649 // beneficial.
17650 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
17651 if (!Mask)
17652 return false;
17653 return Mask->getValue().isPowerOf2();
17654 }
17655
17656 bool AArch64TargetLowering::
17657 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
17658 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
17659 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
17660 SelectionDAG &DAG) const {
17661 // Does the baseline recommend not to perform the fold by default?
17662 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
17663 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
17664 return false;
17665 // Else, if this is a vector shift, prefer 'shl'.
17666 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
17667 }
17668
17669 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
17670 SDNode *N) const {
17671 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
17672 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
17673 return false;
17674 return true;
17675 }
17676
17677 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
17678 // Update IsSplitCSR in AArch64FunctionInfo.
17679 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); 17680 AFI->setIsSplitCSR(true); 17681 } 17682 17683 void AArch64TargetLowering::insertCopiesSplitCSR( 17684 MachineBasicBlock *Entry, 17685 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 17686 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 17687 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 17688 if (!IStart) 17689 return; 17690 17691 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 17692 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 17693 MachineBasicBlock::iterator MBBI = Entry->begin(); 17694 for (const MCPhysReg *I = IStart; *I; ++I) { 17695 const TargetRegisterClass *RC = nullptr; 17696 if (AArch64::GPR64RegClass.contains(*I)) 17697 RC = &AArch64::GPR64RegClass; 17698 else if (AArch64::FPR64RegClass.contains(*I)) 17699 RC = &AArch64::FPR64RegClass; 17700 else 17701 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 17702 17703 Register NewVR = MRI->createVirtualRegister(RC); 17704 // Create copy from CSR to a virtual register. 17705 // FIXME: this currently does not emit CFI pseudo-instructions, it works 17706 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 17707 // nounwind. If we want to generalize this later, we may need to emit 17708 // CFI pseudo-instructions. 17709 assert(Entry->getParent()->getFunction().hasFnAttribute( 17710 Attribute::NoUnwind) && 17711 "Function should be nounwind in insertCopiesSplitCSR!"); 17712 Entry->addLiveIn(*I); 17713 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 17714 .addReg(*I); 17715 17716 // Insert the copy-back instructions right before the terminator. 17717 for (auto *Exit : Exits) 17718 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 17719 TII->get(TargetOpcode::COPY), *I) 17720 .addReg(NewVR); 17721 } 17722 } 17723 17724 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { 17725 // Integer division on AArch64 is expensive. However, when aggressively 17726 // optimizing for code size, we prefer to use a div instruction, as it is 17727 // usually smaller than the alternative sequence. 17728 // The exception to this is vector division. Since AArch64 doesn't have vector 17729 // integer division, leaving the division as-is is a loss even in terms of 17730 // size, because it will have to be scalarized, while the alternative code 17731 // sequence can be performed in vector form. 17732 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); 17733 return OptSize && !VT.isVector(); 17734 } 17735 17736 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 17737 // We want inc-of-add for scalars and sub-of-not for vectors. 
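  // i.e. prefer rewriting (sub Y, (xor X, -1)) as (add (add X, Y), 1) only for
  // scalar types, where the "+1" typically folds into an ADD-immediate; for
  // vectors the NOT is a single NEON/SVE instruction, and the inc-of-add form
  // would otherwise have to materialise a splat of the constant 1.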
17738   return VT.isScalarInteger();
17739 }
17740
17741 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
17742   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
17743 }
17744
17745 unsigned
17746 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
17747   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
17748     return getPointerTy(DL).getSizeInBits();
17749
17750   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
17751 }
17752
17753 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
17754   MF.getFrameInfo().computeMaxCallFrameSize(MF);
17755   TargetLoweringBase::finalizeLowering(MF);
17756 }
17757
17758 // Unlike X86, we let frame lowering assign offsets to all catch objects.
17759 bool AArch64TargetLowering::needsFixedCatchObjects() const {
17760   return false;
17761 }
17762
17763 bool AArch64TargetLowering::shouldLocalize(
17764     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
17765   switch (MI.getOpcode()) {
17766   case TargetOpcode::G_GLOBAL_VALUE: {
17767     // On Darwin, TLS global vars get selected into function calls, which
17768     // we don't want localized, as they can get moved into the middle of
17769     // another call sequence.
17770     const GlobalValue &GV = *MI.getOperand(1).getGlobal();
17771     if (GV.isThreadLocal() && Subtarget->isTargetMachO())
17772       return false;
17773     break;
17774   }
17775   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
17776   // localizable.
17777   case AArch64::ADRP:
17778   case AArch64::G_ADD_LOW:
17779     return true;
17780   default:
17781     break;
17782   }
17783   return TargetLoweringBase::shouldLocalize(MI, TTI);
17784 }
17785
17786 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
17787   if (isa<ScalableVectorType>(Inst.getType()))
17788     return true;
17789
17790   for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
17791     if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
17792       return true;
17793
17794   if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
17795     if (isa<ScalableVectorType>(AI->getAllocatedType()))
17796       return true;
17797   }
17798
17799   return false;
17800 }
17801
17802 // Return the largest legal scalable vector type that matches VT's element type.
17803 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
17804   assert(VT.isFixedLengthVector() &&
17805          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17806          "Expected legal fixed length vector!");
17807   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17808   default:
17809     llvm_unreachable("unexpected element type for SVE container");
17810   case MVT::i8:
17811     return EVT(MVT::nxv16i8);
17812   case MVT::i16:
17813     return EVT(MVT::nxv8i16);
17814   case MVT::i32:
17815     return EVT(MVT::nxv4i32);
17816   case MVT::i64:
17817     return EVT(MVT::nxv2i64);
17818   case MVT::f16:
17819     return EVT(MVT::nxv8f16);
17820   case MVT::f32:
17821     return EVT(MVT::nxv4f32);
17822   case MVT::f64:
17823     return EVT(MVT::nxv2f64);
17824   }
17825 }
17826
17827 // Return a PTRUE with active lanes corresponding to the extent of VT.
17828 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, 17829 EVT VT) { 17830 assert(VT.isFixedLengthVector() && 17831 DAG.getTargetLoweringInfo().isTypeLegal(VT) && 17832 "Expected legal fixed length vector!"); 17833 17834 int PgPattern; 17835 switch (VT.getVectorNumElements()) { 17836 default: 17837 llvm_unreachable("unexpected element count for SVE predicate"); 17838 case 1: 17839 PgPattern = AArch64SVEPredPattern::vl1; 17840 break; 17841 case 2: 17842 PgPattern = AArch64SVEPredPattern::vl2; 17843 break; 17844 case 4: 17845 PgPattern = AArch64SVEPredPattern::vl4; 17846 break; 17847 case 8: 17848 PgPattern = AArch64SVEPredPattern::vl8; 17849 break; 17850 case 16: 17851 PgPattern = AArch64SVEPredPattern::vl16; 17852 break; 17853 case 32: 17854 PgPattern = AArch64SVEPredPattern::vl32; 17855 break; 17856 case 64: 17857 PgPattern = AArch64SVEPredPattern::vl64; 17858 break; 17859 case 128: 17860 PgPattern = AArch64SVEPredPattern::vl128; 17861 break; 17862 case 256: 17863 PgPattern = AArch64SVEPredPattern::vl256; 17864 break; 17865 } 17866 17867 // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can 17868 // use AArch64SVEPredPattern::all, which can enable the use of unpredicated 17869 // variants of instructions when available. 17870 17871 MVT MaskVT; 17872 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 17873 default: 17874 llvm_unreachable("unexpected element type for SVE predicate"); 17875 case MVT::i8: 17876 MaskVT = MVT::nxv16i1; 17877 break; 17878 case MVT::i16: 17879 case MVT::f16: 17880 MaskVT = MVT::nxv8i1; 17881 break; 17882 case MVT::i32: 17883 case MVT::f32: 17884 MaskVT = MVT::nxv4i1; 17885 break; 17886 case MVT::i64: 17887 case MVT::f64: 17888 MaskVT = MVT::nxv2i1; 17889 break; 17890 } 17891 17892 return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT, 17893 DAG.getTargetConstant(PgPattern, DL, MVT::i64)); 17894 } 17895 17896 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, 17897 EVT VT) { 17898 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 17899 "Expected legal scalable vector!"); 17900 auto PredTy = VT.changeVectorElementType(MVT::i1); 17901 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); 17902 } 17903 17904 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { 17905 if (VT.isFixedLengthVector()) 17906 return getPredicateForFixedLengthVector(DAG, DL, VT); 17907 17908 return getPredicateForScalableVector(DAG, DL, VT); 17909 } 17910 17911 // Grow V to consume an entire SVE register. 17912 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { 17913 assert(VT.isScalableVector() && 17914 "Expected to convert into a scalable vector!"); 17915 assert(V.getValueType().isFixedLengthVector() && 17916 "Expected a fixed length vector operand!"); 17917 SDLoc DL(V); 17918 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 17919 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); 17920 } 17921 17922 // Shrink V so it's just big enough to maintain a VT's worth of data. 
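// This is the inverse of convertToScalableVector above: the fixed length
// result is extracted starting at element 0 of the scalable register, e.g. a
// v4i32 value lives in the low 128 bits of an nxv4i32 container.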
17923 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
17924   assert(VT.isFixedLengthVector() &&
17925          "Expected to convert into a fixed length vector!");
17926   assert(V.getValueType().isScalableVector() &&
17927          "Expected a scalable vector operand!");
17928   SDLoc DL(V);
17929   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17930   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
17931 }
17932
17933 // Convert all fixed length vector loads larger than NEON to masked_loads.
17934 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
17935     SDValue Op, SelectionDAG &DAG) const {
17936   auto Load = cast<LoadSDNode>(Op);
17937
17938   SDLoc DL(Op);
17939   EVT VT = Op.getValueType();
17940   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17941
17942   auto NewLoad = DAG.getMaskedLoad(
17943       ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17944       getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
17945       Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
17946       Load->getExtensionType());
17947
17948   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
17949   SDValue MergedValues[2] = {Result, Load->getChain()};
17950   return DAG.getMergeValues(MergedValues, DL);
17951 }
17952
17953 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
17954                                                 SelectionDAG &DAG) {
17955   SDLoc DL(Mask);
17956   EVT InVT = Mask.getValueType();
17957   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
17958
17959   auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
17960   auto Op2 = DAG.getConstant(0, DL, ContainerVT);
17961   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17962
17963   EVT CmpVT = Pg.getValueType();
17964   return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
17965                      {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
17966 }
17967
17968 // Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
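// The fixed length mask is converted to an SVE predicate by
// convertFixedMaskToScalableVector above, i.e. a SETCC_MERGE_ZERO compare
// against zero under a VL-sized PTRUE.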
17969 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( 17970 SDValue Op, SelectionDAG &DAG) const { 17971 auto Load = cast<MaskedLoadSDNode>(Op); 17972 17973 if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD) 17974 return SDValue(); 17975 17976 SDLoc DL(Op); 17977 EVT VT = Op.getValueType(); 17978 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 17979 17980 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG); 17981 17982 SDValue PassThru; 17983 bool IsPassThruZeroOrUndef = false; 17984 17985 if (Load->getPassThru()->isUndef()) { 17986 PassThru = DAG.getUNDEF(ContainerVT); 17987 IsPassThruZeroOrUndef = true; 17988 } else { 17989 if (ContainerVT.isInteger()) 17990 PassThru = DAG.getConstant(0, DL, ContainerVT); 17991 else 17992 PassThru = DAG.getConstantFP(0, DL, ContainerVT); 17993 if (isZerosVector(Load->getPassThru().getNode())) 17994 IsPassThruZeroOrUndef = true; 17995 } 17996 17997 auto NewLoad = DAG.getMaskedLoad( 17998 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), 17999 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(), 18000 Load->getAddressingMode(), Load->getExtensionType()); 18001 18002 if (!IsPassThruZeroOrUndef) { 18003 SDValue OldPassThru = 18004 convertToScalableVector(DAG, ContainerVT, Load->getPassThru()); 18005 NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru); 18006 } 18007 18008 auto Result = convertFromScalableVector(DAG, VT, NewLoad); 18009 SDValue MergedValues[2] = {Result, Load->getChain()}; 18010 return DAG.getMergeValues(MergedValues, DL); 18011 } 18012 18013 // Convert all fixed length vector stores larger than NEON to masked_stores. 18014 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( 18015 SDValue Op, SelectionDAG &DAG) const { 18016 auto Store = cast<StoreSDNode>(Op); 18017 18018 SDLoc DL(Op); 18019 EVT VT = Store->getValue().getValueType(); 18020 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18021 18022 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); 18023 return DAG.getMaskedStore( 18024 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), 18025 getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), 18026 Store->getMemOperand(), Store->getAddressingMode(), 18027 Store->isTruncatingStore()); 18028 } 18029 18030 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( 18031 SDValue Op, SelectionDAG &DAG) const { 18032 auto Store = cast<MaskedStoreSDNode>(Op); 18033 18034 if (Store->isTruncatingStore()) 18035 return SDValue(); 18036 18037 SDLoc DL(Op); 18038 EVT VT = Store->getValue().getValueType(); 18039 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18040 18041 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); 18042 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG); 18043 18044 return DAG.getMaskedStore( 18045 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), 18046 Mask, Store->getMemoryVT(), Store->getMemOperand(), 18047 Store->getAddressingMode(), Store->isTruncatingStore()); 18048 } 18049 18050 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( 18051 SDValue Op, SelectionDAG &DAG) const { 18052 SDLoc dl(Op); 18053 EVT VT = Op.getValueType(); 18054 EVT EltVT = VT.getVectorElementType(); 18055 18056 bool Signed = Op.getOpcode() == ISD::SDIV; 18057 unsigned PredOpcode = Signed ? 
AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; 18058 18059 // Scalable vector i32/i64 DIV is supported. 18060 if (EltVT == MVT::i32 || EltVT == MVT::i64) 18061 return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); 18062 18063 // Scalable vector i8/i16 DIV is not supported. Promote it to i32. 18064 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18065 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 18066 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); 18067 EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT); 18068 18069 // If this is not a full vector, extend, div, and truncate it. 18070 EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext()); 18071 if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) { 18072 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 18073 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0)); 18074 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1)); 18075 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1); 18076 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div); 18077 } 18078 18079 // Convert the operands to scalable vectors. 18080 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); 18081 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); 18082 18083 // Extend the scalable operands. 18084 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; 18085 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; 18086 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0); 18087 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1); 18088 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0); 18089 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1); 18090 18091 // Convert back to fixed vectors so the DIV can be further lowered. 18092 Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo); 18093 Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo); 18094 Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi); 18095 Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi); 18096 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, 18097 Op0Lo, Op1Lo); 18098 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, 18099 Op0Hi, Op1Hi); 18100 18101 // Convert again to scalable vectors to truncate. 18102 ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo); 18103 ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi); 18104 SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT, 18105 ResultLo, ResultHi); 18106 18107 return convertFromScalableVector(DAG, VT, ScalableResult); 18108 } 18109 18110 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( 18111 SDValue Op, SelectionDAG &DAG) const { 18112 EVT VT = Op.getValueType(); 18113 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18114 18115 SDLoc DL(Op); 18116 SDValue Val = Op.getOperand(0); 18117 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); 18118 Val = convertToScalableVector(DAG, ContainerVT, Val); 18119 18120 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; 18121 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; 18122 18123 // Repeatedly unpack Val until the result is of the desired element type. 
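  // Each step widens the element type by unpacking the low half of the
  // register, e.g. an i8 -> i64 extend goes nxv16i8 -> nxv8i16 -> nxv4i32 ->
  // nxv2i64 via three SUNPKLO/UUNPKLO operations.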
18124 switch (ContainerVT.getSimpleVT().SimpleTy) { 18125 default: 18126 llvm_unreachable("unimplemented container type"); 18127 case MVT::nxv16i8: 18128 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val); 18129 if (VT.getVectorElementType() == MVT::i16) 18130 break; 18131 LLVM_FALLTHROUGH; 18132 case MVT::nxv8i16: 18133 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val); 18134 if (VT.getVectorElementType() == MVT::i32) 18135 break; 18136 LLVM_FALLTHROUGH; 18137 case MVT::nxv4i32: 18138 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val); 18139 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!"); 18140 break; 18141 } 18142 18143 return convertFromScalableVector(DAG, VT, Val); 18144 } 18145 18146 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( 18147 SDValue Op, SelectionDAG &DAG) const { 18148 EVT VT = Op.getValueType(); 18149 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18150 18151 SDLoc DL(Op); 18152 SDValue Val = Op.getOperand(0); 18153 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); 18154 Val = convertToScalableVector(DAG, ContainerVT, Val); 18155 18156 // Repeatedly truncate Val until the result is of the desired element type. 18157 switch (ContainerVT.getSimpleVT().SimpleTy) { 18158 default: 18159 llvm_unreachable("unimplemented container type"); 18160 case MVT::nxv2i64: 18161 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); 18162 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); 18163 if (VT.getVectorElementType() == MVT::i32) 18164 break; 18165 LLVM_FALLTHROUGH; 18166 case MVT::nxv4i32: 18167 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); 18168 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); 18169 if (VT.getVectorElementType() == MVT::i16) 18170 break; 18171 LLVM_FALLTHROUGH; 18172 case MVT::nxv8i16: 18173 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); 18174 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); 18175 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); 18176 break; 18177 } 18178 18179 return convertFromScalableVector(DAG, VT, Val); 18180 } 18181 18182 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt( 18183 SDValue Op, SelectionDAG &DAG) const { 18184 EVT VT = Op.getValueType(); 18185 EVT InVT = Op.getOperand(0).getValueType(); 18186 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!"); 18187 18188 SDLoc DL(Op); 18189 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 18190 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); 18191 18192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1)); 18193 } 18194 18195 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( 18196 SDValue Op, SelectionDAG &DAG) const { 18197 EVT VT = Op.getValueType(); 18198 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18199 18200 SDLoc DL(Op); 18201 EVT InVT = Op.getOperand(0).getValueType(); 18202 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 18203 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); 18204 18205 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0, 18206 Op.getOperand(1), Op.getOperand(2)); 18207 18208 return convertFromScalableVector(DAG, VT, ScalableRes); 18209 } 18210 18211 // Convert vector operation 'Op' to an equivalent predicated operation whereby 18212 // the original operation's type 
is used to construct a suitable predicate. 18213 // NOTE: The results for inactive lanes are undefined. 18214 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, 18215 SelectionDAG &DAG, 18216 unsigned NewOp, 18217 bool OverrideNEON) const { 18218 EVT VT = Op.getValueType(); 18219 SDLoc DL(Op); 18220 auto Pg = getPredicateForVector(DAG, DL, VT); 18221 18222 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) { 18223 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18224 18225 // Create list of operands by converting existing ones to scalable types. 18226 SmallVector<SDValue, 4> Operands = {Pg}; 18227 for (const SDValue &V : Op->op_values()) { 18228 if (isa<CondCodeSDNode>(V)) { 18229 Operands.push_back(V); 18230 continue; 18231 } 18232 18233 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) { 18234 EVT VTArg = VTNode->getVT().getVectorElementType(); 18235 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg); 18236 Operands.push_back(DAG.getValueType(NewVTArg)); 18237 continue; 18238 } 18239 18240 assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && 18241 "Only fixed length vectors are supported!"); 18242 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); 18243 } 18244 18245 if (isMergePassthruOpcode(NewOp)) 18246 Operands.push_back(DAG.getUNDEF(ContainerVT)); 18247 18248 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); 18249 return convertFromScalableVector(DAG, VT, ScalableRes); 18250 } 18251 18252 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); 18253 18254 SmallVector<SDValue, 4> Operands = {Pg}; 18255 for (const SDValue &V : Op->op_values()) { 18256 assert((!V.getValueType().isVector() || 18257 V.getValueType().isScalableVector()) && 18258 "Only scalable vectors are supported!"); 18259 Operands.push_back(V); 18260 } 18261 18262 if (isMergePassthruOpcode(NewOp)) 18263 Operands.push_back(DAG.getUNDEF(VT)); 18264 18265 return DAG.getNode(NewOp, DL, VT, Operands); 18266 } 18267 18268 // If a fixed length vector operation has no side effects when applied to 18269 // undefined elements, we can safely use scalable vectors to perform the same 18270 // operation without needing to worry about predication. 18271 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, 18272 SelectionDAG &DAG) const { 18273 EVT VT = Op.getValueType(); 18274 assert(useSVEForFixedLengthVectorVT(VT) && 18275 "Only expected to lower fixed length vector operation!"); 18276 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18277 18278 // Create list of operands by converting existing ones to scalable types. 18279 SmallVector<SDValue, 4> Ops; 18280 for (const SDValue &V : Op->op_values()) { 18281 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!"); 18282 18283 // Pass through non-vector operands. 18284 if (!V.getValueType().isVector()) { 18285 Ops.push_back(V); 18286 continue; 18287 } 18288 18289 // "cast" fixed length vector to a scalable vector. 
18290 assert(useSVEForFixedLengthVectorVT(V.getValueType()) && 18291 "Only fixed length vectors are supported!"); 18292 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); 18293 } 18294 18295 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops); 18296 return convertFromScalableVector(DAG, VT, ScalableRes); 18297 } 18298 18299 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, 18300 SelectionDAG &DAG) const { 18301 SDLoc DL(ScalarOp); 18302 SDValue AccOp = ScalarOp.getOperand(0); 18303 SDValue VecOp = ScalarOp.getOperand(1); 18304 EVT SrcVT = VecOp.getValueType(); 18305 EVT ResVT = SrcVT.getVectorElementType(); 18306 18307 EVT ContainerVT = SrcVT; 18308 if (SrcVT.isFixedLengthVector()) { 18309 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); 18310 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); 18311 } 18312 18313 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); 18314 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 18315 18316 // Convert operands to Scalable. 18317 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, 18318 DAG.getUNDEF(ContainerVT), AccOp, Zero); 18319 18320 // Perform reduction. 18321 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, 18322 Pg, AccOp, VecOp); 18323 18324 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero); 18325 } 18326 18327 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, 18328 SelectionDAG &DAG) const { 18329 SDLoc DL(ReduceOp); 18330 SDValue Op = ReduceOp.getOperand(0); 18331 EVT OpVT = Op.getValueType(); 18332 EVT VT = ReduceOp.getValueType(); 18333 18334 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) 18335 return SDValue(); 18336 18337 SDValue Pg = getPredicateForVector(DAG, DL, OpVT); 18338 18339 switch (ReduceOp.getOpcode()) { 18340 default: 18341 return SDValue(); 18342 case ISD::VECREDUCE_OR: 18343 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); 18344 case ISD::VECREDUCE_AND: { 18345 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg); 18346 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE); 18347 } 18348 case ISD::VECREDUCE_XOR: { 18349 SDValue ID = 18350 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); 18351 SDValue Cntp = 18352 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); 18353 return DAG.getAnyExtOrTrunc(Cntp, DL, VT); 18354 } 18355 } 18356 18357 return SDValue(); 18358 } 18359 18360 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, 18361 SDValue ScalarOp, 18362 SelectionDAG &DAG) const { 18363 SDLoc DL(ScalarOp); 18364 SDValue VecOp = ScalarOp.getOperand(0); 18365 EVT SrcVT = VecOp.getValueType(); 18366 18367 if (useSVEForFixedLengthVectorVT(SrcVT, true)) { 18368 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); 18369 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); 18370 } 18371 18372 // UADDV always returns an i64 result. 18373 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : 18374 SrcVT.getVectorElementType(); 18375 EVT RdxVT = SrcVT; 18376 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED) 18377 RdxVT = getPackedSVEVectorVT(ResVT); 18378 18379 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); 18380 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp); 18381 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, 18382 Rdx, DAG.getConstant(0, DL, MVT::i64)); 18383 18384 // The VEC_REDUCE nodes expect an element size result. 
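  // e.g. a VECREDUCE_ADD of v16i8 must produce an i8, whereas the SVE UADDV
  // reduction above yields an i64, so truncate (or extend) as required.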
18385   if (ResVT != ScalarOp.getValueType())
18386     Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
18387
18388   return Res;
18389 }
18390
18391 SDValue
18392 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
18393                                                          SelectionDAG &DAG) const {
18394   EVT VT = Op.getValueType();
18395   SDLoc DL(Op);
18396
18397   EVT InVT = Op.getOperand(1).getValueType();
18398   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18399   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
18400   SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
18401
18402   // Convert the mask to a predicate (NOTE: We don't need to worry about
18403   // inactive lanes since VSELECT is safe when given undefined elements).
18404   EVT MaskVT = Op.getOperand(0).getValueType();
18405   EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
18406   auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
18407   Mask = DAG.getNode(ISD::TRUNCATE, DL,
18408                      MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
18409
18410   auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
18411                                  Mask, Op1, Op2);
18412
18413   return convertFromScalableVector(DAG, VT, ScalableRes);
18414 }
18415
18416 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
18417     SDValue Op, SelectionDAG &DAG) const {
18418   SDLoc DL(Op);
18419   EVT InVT = Op.getOperand(0).getValueType();
18420   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18421
18422   assert(useSVEForFixedLengthVectorVT(InVT) &&
18423          "Only expected to lower fixed length vector operation!");
18424   assert(Op.getValueType() == InVT.changeTypeToInteger() &&
18425          "Expected integer result of the same bit length as the inputs!");
18426
18427   auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18428   auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18429   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
18430
18431   EVT CmpVT = Pg.getValueType();
18432   auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
18433                          {Pg, Op1, Op2, Op.getOperand(2)});
18434
18435   EVT PromoteVT = ContainerVT.changeTypeToInteger();
18436   auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
18437   return convertFromScalableVector(DAG, Op.getValueType(), Promote);
18438 }
18439
18440 SDValue
18441 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
18442                                                     SelectionDAG &DAG) const {
18443   SDLoc DL(Op);
18444   auto SrcOp = Op.getOperand(0);
18445   EVT VT = Op.getValueType();
18446   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18447   EVT ContainerSrcVT =
18448       getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
18449
18450   SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
18451   Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
18452   return convertFromScalableVector(DAG, VT, Op);
18453 }
18454
18455 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
18456     SDValue Op, SelectionDAG &DAG) const {
18457   SDLoc DL(Op);
18458   unsigned NumOperands = Op->getNumOperands();
18459
18460   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
18461          "Unexpected number of operands in CONCAT_VECTORS");
18462
18463   auto SrcOp1 = Op.getOperand(0);
18464   auto SrcOp2 = Op.getOperand(1);
18465   EVT VT = Op.getValueType();
18466   EVT SrcVT = SrcOp1.getValueType();
18467
18468   if (NumOperands > 2) {
18469     SmallVector<SDValue, 4> Ops;
18470     EVT PairVT =
SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext()); 18471 for (unsigned I = 0; I < NumOperands; I += 2) 18472 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT, 18473 Op->getOperand(I), Op->getOperand(I + 1))); 18474 18475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); 18476 } 18477 18478 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18479 18480 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT); 18481 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1); 18482 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2); 18483 18484 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2); 18485 18486 return convertFromScalableVector(DAG, VT, Op); 18487 } 18488 18489 SDValue 18490 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op, 18491 SelectionDAG &DAG) const { 18492 EVT VT = Op.getValueType(); 18493 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18494 18495 SDLoc DL(Op); 18496 SDValue Val = Op.getOperand(0); 18497 SDValue Pg = getPredicateForVector(DAG, DL, VT); 18498 EVT SrcVT = Val.getValueType(); 18499 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18500 EVT ExtendVT = ContainerVT.changeVectorElementType( 18501 SrcVT.getVectorElementType()); 18502 18503 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val); 18504 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val); 18505 18506 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val); 18507 Val = getSVESafeBitCast(ExtendVT, Val, DAG); 18508 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, 18509 Pg, Val, DAG.getUNDEF(ContainerVT)); 18510 18511 return convertFromScalableVector(DAG, VT, Val); 18512 } 18513 18514 SDValue 18515 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op, 18516 SelectionDAG &DAG) const { 18517 EVT VT = Op.getValueType(); 18518 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18519 18520 SDLoc DL(Op); 18521 SDValue Val = Op.getOperand(0); 18522 EVT SrcVT = Val.getValueType(); 18523 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); 18524 EVT RoundVT = ContainerSrcVT.changeVectorElementType( 18525 VT.getVectorElementType()); 18526 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT); 18527 18528 Val = convertToScalableVector(DAG, ContainerSrcVT, Val); 18529 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val, 18530 Op.getOperand(1), DAG.getUNDEF(RoundVT)); 18531 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG); 18532 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val); 18533 18534 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val); 18535 return DAG.getNode(ISD::BITCAST, DL, VT, Val); 18536 } 18537 18538 SDValue 18539 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op, 18540 SelectionDAG &DAG) const { 18541 EVT VT = Op.getValueType(); 18542 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18543 18544 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP; 18545 unsigned Opcode = IsSigned ? 
AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU 18546 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; 18547 18548 SDLoc DL(Op); 18549 SDValue Val = Op.getOperand(0); 18550 EVT SrcVT = Val.getValueType(); 18551 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); 18552 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); 18553 18554 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <= 18555 ContainerDstVT.getVectorElementType().getSizeInBits()) { 18556 SDValue Pg = getPredicateForVector(DAG, DL, VT); 18557 18558 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 18559 VT.changeTypeToInteger(), Val); 18560 18561 Val = convertToScalableVector(DAG, ContainerSrcVT, Val); 18562 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG); 18563 // Safe to use a larger than specified operand since we just unpacked the 18564 // data, hence the upper bits are zero. 18565 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val, 18566 DAG.getUNDEF(ContainerDstVT)); 18567 return convertFromScalableVector(DAG, VT, Val); 18568 } else { 18569 EVT CvtVT = ContainerSrcVT.changeVectorElementType( 18570 ContainerDstVT.getVectorElementType()); 18571 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT); 18572 18573 Val = convertToScalableVector(DAG, ContainerSrcVT, Val); 18574 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT)); 18575 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG); 18576 Val = convertFromScalableVector(DAG, SrcVT, Val); 18577 18578 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val); 18579 return DAG.getNode(ISD::BITCAST, DL, VT, Val); 18580 } 18581 } 18582 18583 SDValue 18584 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, 18585 SelectionDAG &DAG) const { 18586 EVT VT = Op.getValueType(); 18587 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18588 18589 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; 18590 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU 18591 : AArch64ISD::FCVTZU_MERGE_PASSTHRU; 18592 18593 SDLoc DL(Op); 18594 SDValue Val = Op.getOperand(0); 18595 EVT SrcVT = Val.getValueType(); 18596 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); 18597 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); 18598 18599 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <= 18600 ContainerDstVT.getVectorElementType().getSizeInBits()) { 18601 EVT CvtVT = ContainerDstVT.changeVectorElementType( 18602 ContainerSrcVT.getVectorElementType()); 18603 SDValue Pg = getPredicateForVector(DAG, DL, VT); 18604 18605 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val); 18606 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val); 18607 18608 Val = convertToScalableVector(DAG, ContainerSrcVT, Val); 18609 Val = getSVESafeBitCast(CvtVT, Val, DAG); 18610 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val, 18611 DAG.getUNDEF(ContainerDstVT)); 18612 return convertFromScalableVector(DAG, VT, Val); 18613 } else { 18614 EVT CvtVT = ContainerSrcVT.changeTypeToInteger(); 18615 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT); 18616 18617 // Safe to use a larger than specified result since an fp_to_int where the 18618 // result doesn't fit into the destination is undefined. 
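  // e.g. for a v4f64 -> v4i16 conversion the FCVTZ is done as f64 -> i64
  // within the nxv2i64 container, and the final narrowing is left to the
  // TRUNCATE below.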
18619 Val = convertToScalableVector(DAG, ContainerSrcVT, Val); 18620 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT)); 18621 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val); 18622 18623 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val); 18624 } 18625 } 18626 18627 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( 18628 SDValue Op, SelectionDAG &DAG) const { 18629 EVT VT = Op.getValueType(); 18630 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 18631 18632 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 18633 auto ShuffleMask = SVN->getMask(); 18634 18635 SDLoc DL(Op); 18636 SDValue Op1 = Op.getOperand(0); 18637 SDValue Op2 = Op.getOperand(1); 18638 18639 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 18640 Op1 = convertToScalableVector(DAG, ContainerVT, Op1); 18641 Op2 = convertToScalableVector(DAG, ContainerVT, Op2); 18642 18643 bool ReverseEXT = false; 18644 unsigned Imm; 18645 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) && 18646 Imm == VT.getVectorNumElements() - 1) { 18647 if (ReverseEXT) 18648 std::swap(Op1, Op2); 18649 18650 EVT ScalarTy = VT.getVectorElementType(); 18651 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 18652 ScalarTy = MVT::i32; 18653 SDValue Scalar = DAG.getNode( 18654 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1, 18655 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64)); 18656 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar); 18657 return convertFromScalableVector(DAG, VT, Op); 18658 } 18659 18660 return SDValue(); 18661 } 18662 18663 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, 18664 SelectionDAG &DAG) const { 18665 SDLoc DL(Op); 18666 EVT InVT = Op.getValueType(); 18667 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18668 (void)TLI; 18669 18670 assert(VT.isScalableVector() && TLI.isTypeLegal(VT) && 18671 InVT.isScalableVector() && TLI.isTypeLegal(InVT) && 18672 "Only expect to cast between legal scalable vector types!"); 18673 assert((VT.getVectorElementType() == MVT::i1) == 18674 (InVT.getVectorElementType() == MVT::i1) && 18675 "Cannot cast between data and predicate scalable vector types!"); 18676 18677 if (InVT == VT) 18678 return Op; 18679 18680 if (VT.getVectorElementType() == MVT::i1) 18681 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); 18682 18683 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); 18684 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); 18685 18686 // Pack input if required. 18687 if (InVT != PackedInVT) 18688 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); 18689 18690 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); 18691 18692 // Unpack result if required. 
18693 if (VT != PackedVT) 18694 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); 18695 18696 return Op; 18697 } 18698 18699 bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const { 18700 return ::isAllActivePredicate(N); 18701 } 18702 18703 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const { 18704 return ::getPromotedVTForPredicate(VT); 18705 } 18706 18707 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( 18708 SDValue Op, const APInt &OriginalDemandedBits, 18709 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, 18710 unsigned Depth) const { 18711 18712 unsigned Opc = Op.getOpcode(); 18713 switch (Opc) { 18714 case AArch64ISD::VSHL: { 18715 // Match (VSHL (VLSHR Val X) X) 18716 SDValue ShiftL = Op; 18717 SDValue ShiftR = Op->getOperand(0); 18718 if (ShiftR->getOpcode() != AArch64ISD::VLSHR) 18719 return false; 18720 18721 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse()) 18722 return false; 18723 18724 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1); 18725 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1); 18726 18727 // Other cases can be handled as well, but this is not 18728 // implemented. 18729 if (ShiftRBits != ShiftLBits) 18730 return false; 18731 18732 unsigned ScalarSize = Op.getScalarValueSizeInBits(); 18733 assert(ScalarSize > ShiftLBits && "Invalid shift imm"); 18734 18735 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits); 18736 APInt UnusedBits = ~OriginalDemandedBits; 18737 18738 if ((ZeroBits & UnusedBits) != ZeroBits) 18739 return false; 18740 18741 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not 18742 // used - simplify to just Val. 18743 return TLO.CombineTo(Op, ShiftR->getOperand(0)); 18744 } 18745 } 18746 18747 return TargetLowering::SimplifyDemandedBitsForTargetNode( 18748 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); 18749 } 18750 18751 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal( 18752 unsigned Opc, LLT Ty1, LLT Ty2) const { 18753 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); 18754 } 18755