//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future once both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
// bottleneck on high-end CPUs after this transform. This maximum-leaf-node
// limit is a guard to keep the cmp+ccmp transform profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum number of xors"));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
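// For illustration: getPackedSVEVectorVT(ElementCount::getScalable(4)) gives
// MVT::nxv4i32, per the switch below.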
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
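    // The addDRTypeForNEON calls below register 64-bit (D-register sized)
    // vector types, and the addQRTypeForNEON calls register the 128-bit
    // (Q-register sized) ones.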
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVEorSME()) {
    // Add legal SVE predicate types.
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal SVE data types.
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  // Compute derived properties from the register classes.
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions.
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand these
  // operations when there's a valid register class, so we need custom
  // operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
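  // When f128 is involved, the result is typically a libcall (for example
  // __fixtfsi for an f128 -> i32 FP_TO_SINT); an illustrative note, not an
  // exhaustive list.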
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
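  // Expanding it lets a full 64x64->128-bit multiply be built from MUL plus
  // UMULH/SMULH instead (informal note).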
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
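  // The overflow flag comes from NZCV: e.g. an i32 UADDO is roughly an ADDS
  // whose carry is then materialised into a GPR with CSET/CSINC (a sketch of
  // the intent, not the exact selection).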
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
                  ISD::FEXP,        ISD::FEXP2,        ISD::FLOG,
                  ISD::FLOG2,       ISD::FLOG10,       ISD::STRICT_FREM,
                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
  }

  if (!Subtarget->hasFullFP16()) {
    for (auto Op :
         {ISD::SETCC,          ISD::SELECT_CC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::f16, Promote);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, MVT::f16, Custom);

    // Promote v4f16 to v4f32 when that is known to be safe.
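    // Computing these lanewise in f32 and rounding back to f16 matches the
    // native f16 result for this simple arithmetic, which is presumably why
    // only these ops are promoted while FMA and the rest of the v4f16 ops
    // below are expanded (a hedged explanatory note, not from the original
    // comments).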
    setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
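  // (Roughly the FRINT* family, i.e. FRINTM/FRINTP/FRINTZ/FRINTX/FRINTI/
  // FRINTA/FRINTN, plus FMINNM/FMAXNM; an informal mapping, the selection
  // patterns are authoritative.)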
  for (auto Op :
       {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
        ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
        ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
        ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
        ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
        ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal.
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal.
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding.
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow.
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they
  // don't become loads from the constant pool.
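  // Materialising the bit pattern with MOV/FMOV is typically cheaper than the
  // multi-instruction constant-pool address computation that the large code
  // model would otherwise require (explanatory note, hedged).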
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // loads, floating-point truncating stores, or v2i32->v2i16 truncating
  // stores.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
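  // For example, (add x, (zext (setcc ...))) can frequently be selected as a
  // single CSINC (an illustrative case; the combines below cover more).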
  setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP});

  setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                       ISD::FP_TO_UINT_SAT, ISD::FDIV});

  // Try to combine setcc with csel.
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MSTORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});

  setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                       ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                       ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});

  setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);

  setTargetDAGCombine(ISD::CTLZ);

  // In case of strict alignment, avoid an excessive number of byte wide
  // stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = 4;
  MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the subtarget, but not at the command line.
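  // (getMaximumJumpTableSize() still being UINT_MAX below means no limit was
  // imposed on the command line, so the subtarget's value is allowed to apply.)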
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setMaxDivRemBitWidthSupported(128);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    for (auto Op :
         {ISD::SELECT,         ISD::SELECT_CC,      ISD::SETCC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::v1f64, Expand);

    for (auto Op :
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
          ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
          ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
          ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
      setOperationAction(Op, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction,
    // nor a direct i32 -> f16 one. Set these to Custom so the conversion
    // happens in two steps: v4i32 -> v4f32 -> v4f16.
    for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                    ISD::STRICT_UINT_TO_FP})
      for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
        setOperationAction(Op, VT, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // When AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
    }

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // Saturates
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::AVGFLOORS, VT, Legal);
      setOperationAction(ISD::AVGFLOORU, VT, Legal);
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
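    // Marking the extending-load/truncating-store combinations below Expand
    // splits them into a plain load/store plus separate extend/truncate nodes
    // (a general note on what Expand means here).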
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (auto Op :
         {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
          ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
          ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
        setOperationAction(Op, Ty, Legal);
      if (Subtarget->hasFullFP16())
        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
          setOperationAction(Op, Ty, Legal);
    }

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);

    // ADDP custom lowering
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ADD, VT, Custom);
    // FADDP custom lowering
    for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::FADD, VT, Custom);
  }

  if (Subtarget->hasSME()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  }

  // FIXME: Move lowering for more nodes here if those are common between
  // SVE and SME.
  if (Subtarget->hasSVEorSME()) {
    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
  }

  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::ABDS, VT, Custom);
      setOperationAction(ISD::ABDU, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);

      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);

      if (Subtarget->hasSVE2()) {
        setOperationAction(ISD::AVGFLOORS, VT, Custom);
        setOperationAction(ISD::AVGFLOORU, VT, Custom);
        setOperationAction(ISD::AVGCEILS, VT, Custom);
        setOperationAction(ISD::AVGCEILU, VT, Custom);
      }
    }

    // Illegal unpacked integer vector types.
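    // nxv8i8, nxv4i16 and nxv2i32 only occupy part of a Z register (see
    // isPackedVectorType above); EXTRACT/INSERT_SUBVECTOR are custom-lowered,
    // presumably so such values can still be moved in and out of the packed
    // container types (explanatory note).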
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
      setOperationAction(ISD::BITCAST, VT, Custom);

    for (auto VT :
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      }
    }

    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does.
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    // First, mark all scalable-vector extending loads and truncating stores as
    // Expand, covering both integer and floating-point scalable vectors.
    for (MVT VT : MVT::scalable_vector_valuetypes()) {
      for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // Then, selectively enable those which we directly support.
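    // The combinations re-enabled below correspond to SVE's extending
    // LD1B/LD1H/LD1W (and their signed forms) and truncating ST1B/ST1H/ST1W,
    // which convert between the memory element size and the wider container
    // element size (an informal mapping).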
1339 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); 1340 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); 1341 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); 1342 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); 1343 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); 1344 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); 1345 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { 1346 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); 1347 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); 1348 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); 1349 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); 1350 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); 1351 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); 1352 } 1353 1354 // SVE supports truncating stores of 64 and 128-bit vectors 1355 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); 1356 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); 1357 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom); 1358 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 1359 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 1360 1361 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, 1362 MVT::nxv4f32, MVT::nxv2f64}) { 1363 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1364 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1365 setOperationAction(ISD::MGATHER, VT, Custom); 1366 setOperationAction(ISD::MSCATTER, VT, Custom); 1367 setOperationAction(ISD::MLOAD, VT, Custom); 1368 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); 1369 setOperationAction(ISD::SELECT, VT, Custom); 1370 setOperationAction(ISD::FADD, VT, Custom); 1371 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1372 setOperationAction(ISD::FDIV, VT, Custom); 1373 setOperationAction(ISD::FMA, VT, Custom); 1374 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1375 setOperationAction(ISD::FMAXNUM, VT, Custom); 1376 setOperationAction(ISD::FMINIMUM, VT, Custom); 1377 setOperationAction(ISD::FMINNUM, VT, Custom); 1378 setOperationAction(ISD::FMUL, VT, Custom); 1379 setOperationAction(ISD::FNEG, VT, Custom); 1380 setOperationAction(ISD::FSUB, VT, Custom); 1381 setOperationAction(ISD::FCEIL, VT, Custom); 1382 setOperationAction(ISD::FFLOOR, VT, Custom); 1383 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1384 setOperationAction(ISD::FRINT, VT, Custom); 1385 setOperationAction(ISD::FROUND, VT, Custom); 1386 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1387 setOperationAction(ISD::FTRUNC, VT, Custom); 1388 setOperationAction(ISD::FSQRT, VT, Custom); 1389 setOperationAction(ISD::FABS, VT, Custom); 1390 setOperationAction(ISD::FP_EXTEND, VT, Custom); 1391 setOperationAction(ISD::FP_ROUND, VT, Custom); 1392 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1393 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1394 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1395 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1396 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1397 1398 setOperationAction(ISD::SELECT_CC, VT, Expand); 1399 setOperationAction(ISD::FREM, VT, Expand); 1400 setOperationAction(ISD::FPOW, VT, Expand); 1401 setOperationAction(ISD::FPOWI, VT, Expand); 1402 setOperationAction(ISD::FCOS, VT, Expand); 1403 setOperationAction(ISD::FSIN, VT, Expand); 1404 setOperationAction(ISD::FSINCOS, VT, Expand); 1405 setOperationAction(ISD::FEXP, VT, Expand); 1406 setOperationAction(ISD::FEXP2, VT, Expand); 1407 setOperationAction(ISD::FLOG, VT, Expand); 1408 
setOperationAction(ISD::FLOG2, VT, Expand); 1409 setOperationAction(ISD::FLOG10, VT, Expand); 1410 1411 setCondCodeAction(ISD::SETO, VT, Expand); 1412 setCondCodeAction(ISD::SETOLT, VT, Expand); 1413 setCondCodeAction(ISD::SETLT, VT, Expand); 1414 setCondCodeAction(ISD::SETOLE, VT, Expand); 1415 setCondCodeAction(ISD::SETLE, VT, Expand); 1416 setCondCodeAction(ISD::SETULT, VT, Expand); 1417 setCondCodeAction(ISD::SETULE, VT, Expand); 1418 setCondCodeAction(ISD::SETUGE, VT, Expand); 1419 setCondCodeAction(ISD::SETUGT, VT, Expand); 1420 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1421 setCondCodeAction(ISD::SETONE, VT, Expand); 1422 } 1423 1424 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { 1425 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1426 setOperationAction(ISD::MGATHER, VT, Custom); 1427 setOperationAction(ISD::MSCATTER, VT, Custom); 1428 setOperationAction(ISD::MLOAD, VT, Custom); 1429 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1430 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); 1431 } 1432 1433 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); 1434 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); 1435 1436 // NEON doesn't support integer divides, but SVE does 1437 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 1438 MVT::v4i32, MVT::v1i64, MVT::v2i64}) { 1439 setOperationAction(ISD::SDIV, VT, Custom); 1440 setOperationAction(ISD::UDIV, VT, Custom); 1441 } 1442 1443 // NEON doesn't support 64-bit vector integer muls, but SVE does. 1444 setOperationAction(ISD::MUL, MVT::v1i64, Custom); 1445 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 1446 1447 // NEON doesn't support across-vector reductions, but SVE does. 1448 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) 1449 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1450 1451 if (Subtarget->forceStreamingCompatibleSVE()) { 1452 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom); 1453 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom); 1454 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom); 1455 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom); 1456 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom); 1457 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom); 1458 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom); 1459 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom); 1460 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); 1461 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 1462 MVT::v4i32, MVT::v1i64, MVT::v2i64}) 1463 addTypeForStreamingSVE(VT); 1464 1465 for (MVT VT : 1466 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) 1467 addTypeForStreamingSVE(VT); 1468 } 1469 1470 // NOTE: Currently this has to happen after computeRegisterProperties rather 1471 // than the preferred option of combining it with the addRegisterClass call. 1472 if (Subtarget->useSVEForFixedLengthVectors()) { 1473 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 1474 if (useSVEForFixedLengthVectorVT(VT)) 1475 addTypeForFixedLengthSVE(VT); 1476 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) 1477 if (useSVEForFixedLengthVectorVT(VT)) 1478 addTypeForFixedLengthSVE(VT); 1479 1480 // 64bit results can mean a bigger than NEON input. 1481 for (auto VT : {MVT::v8i8, MVT::v4i16}) 1482 setOperationAction(ISD::TRUNCATE, VT, Custom); 1483 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); 1484 1485 // 128bit results imply a bigger than NEON input. 
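    // (For example, a TRUNCATE producing v16i8 may take a v16i16 or wider
    // source, which no longer fits in a single NEON register.)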
1486 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) 1487 setOperationAction(ISD::TRUNCATE, VT, Custom); 1488 for (auto VT : {MVT::v8f16, MVT::v4f32}) 1489 setOperationAction(ISD::FP_ROUND, VT, Custom); 1490 1491 // These operations are not supported on NEON but SVE can do them. 1492 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); 1493 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); 1494 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); 1495 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 1496 setOperationAction(ISD::MULHS, MVT::v1i64, Custom); 1497 setOperationAction(ISD::MULHS, MVT::v2i64, Custom); 1498 setOperationAction(ISD::MULHU, MVT::v1i64, Custom); 1499 setOperationAction(ISD::MULHU, MVT::v2i64, Custom); 1500 setOperationAction(ISD::SMAX, MVT::v1i64, Custom); 1501 setOperationAction(ISD::SMAX, MVT::v2i64, Custom); 1502 setOperationAction(ISD::SMIN, MVT::v1i64, Custom); 1503 setOperationAction(ISD::SMIN, MVT::v2i64, Custom); 1504 setOperationAction(ISD::UMAX, MVT::v1i64, Custom); 1505 setOperationAction(ISD::UMAX, MVT::v2i64, Custom); 1506 setOperationAction(ISD::UMIN, MVT::v1i64, Custom); 1507 setOperationAction(ISD::UMIN, MVT::v2i64, Custom); 1508 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); 1509 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); 1510 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); 1511 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); 1512 1513 // Int operations with no NEON support. 1514 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 1515 MVT::v2i32, MVT::v4i32, MVT::v2i64}) { 1516 setOperationAction(ISD::BITREVERSE, VT, Custom); 1517 setOperationAction(ISD::CTTZ, VT, Custom); 1518 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1519 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1520 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1521 } 1522 1523 1524 // Use SVE for vectors with more than 2 elements. 1525 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) 1526 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1527 } 1528 1529 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); 1530 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); 1531 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); 1532 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); 1533 1534 setOperationAction(ISD::VSCALE, MVT::i32, Custom); 1535 } 1536 1537 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { 1538 // Only required for llvm.aarch64.mops.memset.tag 1539 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 1540 } 1541 1542 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1543 1544 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 1545 1546 IsStrictFPEnabled = true; 1547 } 1548 1549 void AArch64TargetLowering::addTypeForNEON(MVT VT) { 1550 assert(VT.isVector() && "VT should be a vector type"); 1551 1552 if (VT.isFloatingPoint()) { 1553 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); 1554 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); 1555 setOperationPromotedToType(ISD::STORE, VT, PromoteTo); 1556 } 1557 1558 // Mark vector float intrinsics as expand. 
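  // NEON has no instructions for these transcendental operations, so they are
  // unrolled and end up as scalar libcalls.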
1559 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 1560 setOperationAction(ISD::FSIN, VT, Expand); 1561 setOperationAction(ISD::FCOS, VT, Expand); 1562 setOperationAction(ISD::FPOW, VT, Expand); 1563 setOperationAction(ISD::FLOG, VT, Expand); 1564 setOperationAction(ISD::FLOG2, VT, Expand); 1565 setOperationAction(ISD::FLOG10, VT, Expand); 1566 setOperationAction(ISD::FEXP, VT, Expand); 1567 setOperationAction(ISD::FEXP2, VT, Expand); 1568 } 1569 1570 // But we do support custom-lowering for FCOPYSIGN. 1571 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || 1572 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) 1573 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1574 1575 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1576 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1577 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1578 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); 1579 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1580 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1581 setOperationAction(ISD::SRA, VT, Custom); 1582 setOperationAction(ISD::SRL, VT, Custom); 1583 setOperationAction(ISD::SHL, VT, Custom); 1584 setOperationAction(ISD::OR, VT, Custom); 1585 setOperationAction(ISD::SETCC, VT, Custom); 1586 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 1587 1588 setOperationAction(ISD::SELECT, VT, Expand); 1589 setOperationAction(ISD::SELECT_CC, VT, Expand); 1590 setOperationAction(ISD::VSELECT, VT, Expand); 1591 for (MVT InnerVT : MVT::all_valuetypes()) 1592 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 1593 1594 // CNT supports only B element sizes, then use UADDLP to widen. 1595 if (VT != MVT::v8i8 && VT != MVT::v16i8) 1596 setOperationAction(ISD::CTPOP, VT, Custom); 1597 1598 setOperationAction(ISD::UDIV, VT, Expand); 1599 setOperationAction(ISD::SDIV, VT, Expand); 1600 setOperationAction(ISD::UREM, VT, Expand); 1601 setOperationAction(ISD::SREM, VT, Expand); 1602 setOperationAction(ISD::FREM, VT, Expand); 1603 1604 for (unsigned Opcode : 1605 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, 1606 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) 1607 setOperationAction(Opcode, VT, Custom); 1608 1609 if (!VT.isFloatingPoint()) 1610 setOperationAction(ISD::ABS, VT, Legal); 1611 1612 // [SU][MIN|MAX] are available for all NEON types apart from i64. 1613 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 1614 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 1615 setOperationAction(Opcode, VT, Legal); 1616 1617 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP 1618 // NEON types. 
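  // (bf16 vectors have no native support for these operations and f16 vectors
  // require the FullFP16 extension, hence the element-type guards below.)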
  if (VT.isFloatingPoint() &&
      VT.getVectorElementType() != MVT::bf16 &&
      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
    for (unsigned Opcode :
         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
          ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
          ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
          ISD::STRICT_FSQRT})
      setOperationAction(Opcode, VT, Legal);

  // Strict fp extend and trunc are legal
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);

  // FIXME: We could potentially make use of the vector comparison instructions
  // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
  // complications:
  //  * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
  //    so we would need to expand when the condition code doesn't match the
  //    kind of comparison.
  //  * Some kinds of comparison require more than one FCMXY instruction so
  //    would need to be expanded instead.
  //  * The lowering of the non-strict versions involves target-specific ISD
  //    nodes so we would likely need to add strict versions of all of them and
  //    handle them appropriately.
  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }
  }

  if (Subtarget->hasD128()) {
    setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
    setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
  }
}

bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
                                                          EVT OpVT) const {
  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
  if (!Subtarget->hasSVE())
    return true;

  // We can only support legal predicate result types. We can use the SVE
  // whilelo instruction for generating fixed-width predicates too.
  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
      ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
    return true;

  // The whilelo instruction only works with i32 or i64 scalar inputs.
  if (OpVT != MVT::i32 && OpVT != MVT::i64)
    return true;

  return false;
}

void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
  // By default set all operations to Expand,
  // then change to Legal/Custom if needed.
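  // Streaming-compatible code cannot use the usual NEON patterns, so anything
  // that is not explicitly re-enabled below is expanded.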
1687 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) 1688 setOperationAction(Op, VT, Expand); 1689 1690 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 1691 1692 if (VT.isFloatingPoint()) { 1693 setCondCodeAction(ISD::SETO, VT, Expand); 1694 setCondCodeAction(ISD::SETOLT, VT, Expand); 1695 setCondCodeAction(ISD::SETOLE, VT, Expand); 1696 setCondCodeAction(ISD::SETULT, VT, Expand); 1697 setCondCodeAction(ISD::SETULE, VT, Expand); 1698 setCondCodeAction(ISD::SETUGE, VT, Expand); 1699 setCondCodeAction(ISD::SETUGT, VT, Expand); 1700 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1701 setCondCodeAction(ISD::SETONE, VT, Expand); 1702 } 1703 1704 // STORE, LOAD, SCALAR_TO_VECTOR and BITCAST are natively supported, 1705 // so no need to Custom/Expand them. 1706 setOperationAction(ISD::STORE, VT, Legal); 1707 setOperationAction(ISD::LOAD, VT, Legal); 1708 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); 1709 setOperationAction(ISD::BITCAST, VT, Legal); 1710 1711 // Mark integer truncating stores/extending loads as having custom lowering 1712 if (VT.isInteger()) { 1713 MVT InnerVT = VT.changeVectorElementType(MVT::i8); 1714 while (InnerVT != VT) { 1715 setTruncStoreAction(VT, InnerVT, Custom); 1716 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); 1717 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); 1718 InnerVT = InnerVT.changeVectorElementType( 1719 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); 1720 } 1721 } 1722 1723 // Mark floating-point truncating stores/extending loads as having custom 1724 // lowering 1725 if (VT.isFloatingPoint()) { 1726 MVT InnerVT = VT.changeVectorElementType(MVT::f16); 1727 while (InnerVT != VT) { 1728 setTruncStoreAction(VT, InnerVT, Custom); 1729 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); 1730 InnerVT = InnerVT.changeVectorElementType( 1731 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); 1732 } 1733 } 1734 1735 setOperationAction(ISD::ABS, VT, Custom); 1736 setOperationAction(ISD::ADD, VT, Custom); 1737 setOperationAction(ISD::AND, VT, Custom); 1738 setOperationAction(ISD::ANY_EXTEND, VT, Custom); 1739 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1740 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1741 setOperationAction(ISD::CTLZ, VT, Custom); 1742 setOperationAction(ISD::CTPOP, VT, Custom); 1743 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1744 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1745 setOperationAction(ISD::FABS, VT, Custom); 1746 setOperationAction(ISD::FADD, VT, Custom); 1747 setOperationAction(ISD::FCEIL, VT, Custom); 1748 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1749 setOperationAction(ISD::FDIV, VT, Custom); 1750 setOperationAction(ISD::FFLOOR, VT, Custom); 1751 setOperationAction(ISD::FMA, VT, Custom); 1752 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1753 setOperationAction(ISD::FMAXNUM, VT, Custom); 1754 setOperationAction(ISD::FMINIMUM, VT, Custom); 1755 setOperationAction(ISD::FMINNUM, VT, Custom); 1756 setOperationAction(ISD::FMUL, VT, Custom); 1757 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1758 setOperationAction(ISD::FNEG, VT, Custom); 1759 setOperationAction(ISD::FP_ROUND, VT, Custom); 1760 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1761 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1762 setOperationAction(ISD::FRINT, VT, Custom); 1763 setOperationAction(ISD::FROUND, VT, Custom); 1764 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1765 setOperationAction(ISD::FSQRT, VT, Custom); 1766 
setOperationAction(ISD::FSUB, VT, Custom); 1767 setOperationAction(ISD::FTRUNC, VT, Custom); 1768 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1769 setOperationAction(ISD::MLOAD, VT, Custom); 1770 setOperationAction(ISD::MSTORE, VT, Custom); 1771 setOperationAction(ISD::MUL, VT, Custom); 1772 setOperationAction(ISD::MULHS, VT, Custom); 1773 setOperationAction(ISD::MULHU, VT, Custom); 1774 setOperationAction(ISD::OR, VT, Custom); 1775 setOperationAction(ISD::SDIV, VT, Custom); 1776 setOperationAction(ISD::SETCC, VT, Custom); 1777 setOperationAction(ISD::SHL, VT, Custom); 1778 setOperationAction(ISD::SIGN_EXTEND, VT, Custom); 1779 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1780 setOperationAction(ISD::SMAX, VT, Custom); 1781 setOperationAction(ISD::SMIN, VT, Custom); 1782 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 1783 setOperationAction(ISD::SRA, VT, Custom); 1784 setOperationAction(ISD::SRL, VT, Custom); 1785 setOperationAction(ISD::SUB, VT, Custom); 1786 setOperationAction(ISD::TRUNCATE, VT, Custom); 1787 setOperationAction(ISD::UDIV, VT, Custom); 1788 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1789 setOperationAction(ISD::UMAX, VT, Custom); 1790 setOperationAction(ISD::UMIN, VT, Custom); 1791 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1792 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1793 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1794 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1795 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1796 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1797 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1798 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1799 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1800 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1801 setOperationAction(ISD::XOR, VT, Custom); 1802 setOperationAction(ISD::ZERO_EXTEND, VT, Custom); 1803 } 1804 1805 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { 1806 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 1807 1808 // By default everything must be expanded. 
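  // The Custom entries set below route each operation through the SVE
  // lowering path: the fixed-length operands are inserted into scalable
  // containers, the equivalent predicated SVE node is emitted, and the result
  // is extracted again (see convertToScalableVector and
  // convertFromScalableVector further down).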
1809 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) 1810 setOperationAction(Op, VT, Expand); 1811 1812 if (VT.isFloatingPoint()) { 1813 setCondCodeAction(ISD::SETO, VT, Expand); 1814 setCondCodeAction(ISD::SETOLT, VT, Expand); 1815 setCondCodeAction(ISD::SETOLE, VT, Expand); 1816 setCondCodeAction(ISD::SETULT, VT, Expand); 1817 setCondCodeAction(ISD::SETULE, VT, Expand); 1818 setCondCodeAction(ISD::SETUGE, VT, Expand); 1819 setCondCodeAction(ISD::SETUGT, VT, Expand); 1820 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1821 setCondCodeAction(ISD::SETONE, VT, Expand); 1822 } 1823 1824 // Mark integer truncating stores/extending loads as having custom lowering 1825 if (VT.isInteger()) { 1826 MVT InnerVT = VT.changeVectorElementType(MVT::i8); 1827 while (InnerVT != VT) { 1828 setTruncStoreAction(VT, InnerVT, Custom); 1829 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); 1830 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); 1831 InnerVT = InnerVT.changeVectorElementType( 1832 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); 1833 } 1834 } 1835 1836 // Mark floating-point truncating stores/extending loads as having custom 1837 // lowering 1838 if (VT.isFloatingPoint()) { 1839 MVT InnerVT = VT.changeVectorElementType(MVT::f16); 1840 while (InnerVT != VT) { 1841 setTruncStoreAction(VT, InnerVT, Custom); 1842 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); 1843 InnerVT = InnerVT.changeVectorElementType( 1844 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); 1845 } 1846 } 1847 1848 // Lower fixed length vector operations to scalable equivalents. 1849 setOperationAction(ISD::ABS, VT, Custom); 1850 setOperationAction(ISD::ADD, VT, Custom); 1851 setOperationAction(ISD::AND, VT, Custom); 1852 setOperationAction(ISD::ANY_EXTEND, VT, Custom); 1853 setOperationAction(ISD::BITCAST, VT, Custom); 1854 setOperationAction(ISD::BITREVERSE, VT, Custom); 1855 setOperationAction(ISD::BSWAP, VT, Custom); 1856 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1857 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1858 setOperationAction(ISD::CTLZ, VT, Custom); 1859 setOperationAction(ISD::CTPOP, VT, Custom); 1860 setOperationAction(ISD::CTTZ, VT, Custom); 1861 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1862 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1863 setOperationAction(ISD::FABS, VT, Custom); 1864 setOperationAction(ISD::FADD, VT, Custom); 1865 setOperationAction(ISD::FCEIL, VT, Custom); 1866 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1867 setOperationAction(ISD::FDIV, VT, Custom); 1868 setOperationAction(ISD::FFLOOR, VT, Custom); 1869 setOperationAction(ISD::FMA, VT, Custom); 1870 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1871 setOperationAction(ISD::FMAXNUM, VT, Custom); 1872 setOperationAction(ISD::FMINIMUM, VT, Custom); 1873 setOperationAction(ISD::FMINNUM, VT, Custom); 1874 setOperationAction(ISD::FMUL, VT, Custom); 1875 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1876 setOperationAction(ISD::FNEG, VT, Custom); 1877 setOperationAction(ISD::FP_EXTEND, VT, Custom); 1878 setOperationAction(ISD::FP_ROUND, VT, Custom); 1879 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1880 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1881 setOperationAction(ISD::FRINT, VT, Custom); 1882 setOperationAction(ISD::FROUND, VT, Custom); 1883 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1884 setOperationAction(ISD::FSQRT, VT, Custom); 1885 setOperationAction(ISD::FSUB, VT, Custom); 1886 setOperationAction(ISD::FTRUNC, VT, Custom); 1887 
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1888 setOperationAction(ISD::LOAD, VT, Custom); 1889 setOperationAction(ISD::MGATHER, VT, Custom); 1890 setOperationAction(ISD::MLOAD, VT, Custom); 1891 setOperationAction(ISD::MSCATTER, VT, Custom); 1892 setOperationAction(ISD::MSTORE, VT, Custom); 1893 setOperationAction(ISD::MUL, VT, Custom); 1894 setOperationAction(ISD::MULHS, VT, Custom); 1895 setOperationAction(ISD::MULHU, VT, Custom); 1896 setOperationAction(ISD::OR, VT, Custom); 1897 setOperationAction(ISD::SDIV, VT, Custom); 1898 setOperationAction(ISD::SELECT, VT, Custom); 1899 setOperationAction(ISD::SETCC, VT, Custom); 1900 setOperationAction(ISD::SHL, VT, Custom); 1901 setOperationAction(ISD::SIGN_EXTEND, VT, Custom); 1902 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); 1903 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1904 setOperationAction(ISD::SMAX, VT, Custom); 1905 setOperationAction(ISD::SMIN, VT, Custom); 1906 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 1907 setOperationAction(ISD::SRA, VT, Custom); 1908 setOperationAction(ISD::SRL, VT, Custom); 1909 setOperationAction(ISD::STORE, VT, Custom); 1910 setOperationAction(ISD::SUB, VT, Custom); 1911 setOperationAction(ISD::TRUNCATE, VT, Custom); 1912 setOperationAction(ISD::UDIV, VT, Custom); 1913 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1914 setOperationAction(ISD::UMAX, VT, Custom); 1915 setOperationAction(ISD::UMIN, VT, Custom); 1916 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1917 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1918 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1919 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1920 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1921 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1922 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1923 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1924 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1925 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1926 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1927 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1928 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1929 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1930 setOperationAction(ISD::VSELECT, VT, Custom); 1931 setOperationAction(ISD::XOR, VT, Custom); 1932 setOperationAction(ISD::ZERO_EXTEND, VT, Custom); 1933 } 1934 1935 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 1936 addRegisterClass(VT, &AArch64::FPR64RegClass); 1937 addTypeForNEON(VT); 1938 } 1939 1940 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 1941 addRegisterClass(VT, &AArch64::FPR128RegClass); 1942 addTypeForNEON(VT); 1943 } 1944 1945 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, 1946 LLVMContext &C, EVT VT) const { 1947 if (!VT.isVector()) 1948 return MVT::i32; 1949 if (VT.isScalableVector()) 1950 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); 1951 return VT.changeVectorElementTypeToInteger(); 1952 } 1953 1954 // isIntImmediate - This method tests to see if the node is a constant 1955 // operand. If so Imm will receive the value. 1956 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { 1957 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) { 1958 Imm = C->getZExtValue(); 1959 return true; 1960 } 1961 return false; 1962 } 1963 1964 // isOpcWithIntImmediate - This method tests to see if the node is a specific 1965 // opcode and that it has a immediate integer right operand. 
1966 // If so Imm will receive the value. 1967 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, 1968 uint64_t &Imm) { 1969 return N->getOpcode() == Opc && 1970 isIntImmediate(N->getOperand(1).getNode(), Imm); 1971 } 1972 1973 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, 1974 const APInt &Demanded, 1975 TargetLowering::TargetLoweringOpt &TLO, 1976 unsigned NewOpc) { 1977 uint64_t OldImm = Imm, NewImm, Enc; 1978 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; 1979 1980 // Return if the immediate is already all zeros, all ones, a bimm32 or a 1981 // bimm64. 1982 if (Imm == 0 || Imm == Mask || 1983 AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) 1984 return false; 1985 1986 unsigned EltSize = Size; 1987 uint64_t DemandedBits = Demanded.getZExtValue(); 1988 1989 // Clear bits that are not demanded. 1990 Imm &= DemandedBits; 1991 1992 while (true) { 1993 // The goal here is to set the non-demanded bits in a way that minimizes 1994 // the number of switching between 0 and 1. In order to achieve this goal, 1995 // we set the non-demanded bits to the value of the preceding demanded bits. 1996 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a 1997 // non-demanded bit), we copy bit0 (1) to the least significant 'x', 1998 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. 1999 // The final result is 0b11000011. 2000 uint64_t NonDemandedBits = ~DemandedBits; 2001 uint64_t InvertedImm = ~Imm & DemandedBits; 2002 uint64_t RotatedImm = 2003 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & 2004 NonDemandedBits; 2005 uint64_t Sum = RotatedImm + NonDemandedBits; 2006 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); 2007 uint64_t Ones = (Sum + Carry) & NonDemandedBits; 2008 NewImm = (Imm | Ones) & Mask; 2009 2010 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate 2011 // or all-ones or all-zeros, in which case we can stop searching. Otherwise, 2012 // we halve the element size and continue the search. 2013 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) 2014 break; 2015 2016 // We cannot shrink the element size any further if it is 2-bits. 2017 if (EltSize == 2) 2018 return false; 2019 2020 EltSize /= 2; 2021 Mask >>= EltSize; 2022 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; 2023 2024 // Return if there is mismatch in any of the demanded bits of Imm and Hi. 2025 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) 2026 return false; 2027 2028 // Merge the upper and lower halves of Imm and DemandedBits. 2029 Imm |= Hi; 2030 DemandedBits |= DemandedBitsHi; 2031 } 2032 2033 ++NumOptimizedImms; 2034 2035 // Replicate the element across the register width. 2036 while (EltSize < Size) { 2037 NewImm |= NewImm << EltSize; 2038 EltSize *= 2; 2039 } 2040 2041 (void)OldImm; 2042 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && 2043 "demanded bits should never be altered"); 2044 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); 2045 2046 // Create the new constant immediate node. 2047 EVT VT = Op.getValueType(); 2048 SDLoc DL(Op); 2049 SDValue New; 2050 2051 // If the new constant immediate is all-zeros or all-ones, let the target 2052 // independent DAG combine optimize this node. 
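  // (An AND/OR/XOR with an all-zero or all-ones immediate folds to a
  // constant, the other operand or a NOT, so keeping a plain constant here
  // lets the generic combines finish the job.)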
2053 if (NewImm == 0 || NewImm == OrigMask) { 2054 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 2055 TLO.DAG.getConstant(NewImm, DL, VT)); 2056 // Otherwise, create a machine node so that target independent DAG combine 2057 // doesn't undo this optimization. 2058 } else { 2059 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); 2060 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); 2061 New = SDValue( 2062 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); 2063 } 2064 2065 return TLO.CombineTo(Op, New); 2066 } 2067 2068 bool AArch64TargetLowering::targetShrinkDemandedConstant( 2069 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 2070 TargetLoweringOpt &TLO) const { 2071 // Delay this optimization to as late as possible. 2072 if (!TLO.LegalOps) 2073 return false; 2074 2075 if (!EnableOptimizeLogicalImm) 2076 return false; 2077 2078 EVT VT = Op.getValueType(); 2079 if (VT.isVector()) 2080 return false; 2081 2082 unsigned Size = VT.getSizeInBits(); 2083 assert((Size == 32 || Size == 64) && 2084 "i32 or i64 is expected after legalization."); 2085 2086 // Exit early if we demand all bits. 2087 if (DemandedBits.countPopulation() == Size) 2088 return false; 2089 2090 unsigned NewOpc; 2091 switch (Op.getOpcode()) { 2092 default: 2093 return false; 2094 case ISD::AND: 2095 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; 2096 break; 2097 case ISD::OR: 2098 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; 2099 break; 2100 case ISD::XOR: 2101 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; 2102 break; 2103 } 2104 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 2105 if (!C) 2106 return false; 2107 uint64_t Imm = C->getZExtValue(); 2108 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); 2109 } 2110 2111 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 2112 /// Mask are known to be either zero or one and return them Known. 2113 void AArch64TargetLowering::computeKnownBitsForTargetNode( 2114 const SDValue Op, KnownBits &Known, const APInt &DemandedElts, 2115 const SelectionDAG &DAG, unsigned Depth) const { 2116 switch (Op.getOpcode()) { 2117 default: 2118 break; 2119 case AArch64ISD::DUP: { 2120 SDValue SrcOp = Op.getOperand(0); 2121 Known = DAG.computeKnownBits(SrcOp, Depth + 1); 2122 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { 2123 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && 2124 "Expected DUP implicit truncation"); 2125 Known = Known.trunc(Op.getScalarValueSizeInBits()); 2126 } 2127 break; 2128 } 2129 case AArch64ISD::CSEL: { 2130 KnownBits Known2; 2131 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2132 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 2133 Known = KnownBits::commonBits(Known, Known2); 2134 break; 2135 } 2136 case AArch64ISD::BICi: { 2137 // Compute the bit cleared value. 
    uint64_t Mask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
    break;
  }
  case AArch64ISD::VLSHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::lshr(Known, Known2);
    break;
  }
  case AArch64ISD::VASHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::ashr(Known, Known2);
    break;
  }
  case AArch64ISD::MOVI: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
    Known =
        KnownBits::makeConstant(APInt(Known.getBitWidth(), CN->getZExtValue()));
    break;
  }
  case AArch64ISD::LOADgot:
  case AArch64ISD::ADDlow: {
    if (!Subtarget->isTargetILP32())
      break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
    break;
  }
  case AArch64ISD::ASSERT_ZEXT_BOOL: {
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
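      // For example, umaxv of a v16i8 vector yields an i32 result whose top
      // 24 bits are known to be zero, and a v8i16 input leaves the top 16
      // bits zero.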
2205 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 2206 unsigned BitWidth = Known.getBitWidth(); 2207 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 2208 assert(BitWidth >= 8 && "Unexpected width!"); 2209 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 2210 Known.Zero |= Mask; 2211 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 2212 assert(BitWidth >= 16 && "Unexpected width!"); 2213 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 2214 Known.Zero |= Mask; 2215 } 2216 break; 2217 } break; 2218 } 2219 } 2220 } 2221 } 2222 2223 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 2224 EVT) const { 2225 return MVT::i64; 2226 } 2227 2228 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 2229 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 2230 unsigned *Fast) const { 2231 if (Subtarget->requiresStrictAlign()) 2232 return false; 2233 2234 if (Fast) { 2235 // Some CPUs are fine with unaligned stores except for 128-bit ones. 2236 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 2237 // See comments in performSTORECombine() for more details about 2238 // these conditions. 2239 2240 // Code that uses clang vector extensions can mark that it 2241 // wants unaligned accesses to be treated as fast by 2242 // underspecifying alignment to be 1 or 2. 2243 Alignment <= 2 || 2244 2245 // Disregard v2i64. Memcpy lowering produces those and splitting 2246 // them regresses performance on micro-benchmarks and olden/bh. 2247 VT == MVT::v2i64; 2248 } 2249 return true; 2250 } 2251 2252 // Same as above but handling LLTs instead. 2253 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 2254 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 2255 unsigned *Fast) const { 2256 if (Subtarget->requiresStrictAlign()) 2257 return false; 2258 2259 if (Fast) { 2260 // Some CPUs are fine with unaligned stores except for 128-bit ones. 2261 *Fast = !Subtarget->isMisaligned128StoreSlow() || 2262 Ty.getSizeInBytes() != 16 || 2263 // See comments in performSTORECombine() for more details about 2264 // these conditions. 2265 2266 // Code that uses clang vector extensions can mark that it 2267 // wants unaligned accesses to be treated as fast by 2268 // underspecifying alignment to be 1 or 2. 2269 Alignment <= 2 || 2270 2271 // Disregard v2i64. Memcpy lowering produces those and splitting 2272 // them regresses performance on micro-benchmarks and olden/bh. 
2273 Ty == LLT::fixed_vector(2, 64); 2274 } 2275 return true; 2276 } 2277 2278 FastISel * 2279 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 2280 const TargetLibraryInfo *libInfo) const { 2281 return AArch64::createFastISel(funcInfo, libInfo); 2282 } 2283 2284 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 2285 #define MAKE_CASE(V) \ 2286 case V: \ 2287 return #V; 2288 switch ((AArch64ISD::NodeType)Opcode) { 2289 case AArch64ISD::FIRST_NUMBER: 2290 break; 2291 MAKE_CASE(AArch64ISD::OBSCURE_COPY) 2292 MAKE_CASE(AArch64ISD::SMSTART) 2293 MAKE_CASE(AArch64ISD::SMSTOP) 2294 MAKE_CASE(AArch64ISD::RESTORE_ZA) 2295 MAKE_CASE(AArch64ISD::CALL) 2296 MAKE_CASE(AArch64ISD::ADRP) 2297 MAKE_CASE(AArch64ISD::ADR) 2298 MAKE_CASE(AArch64ISD::ADDlow) 2299 MAKE_CASE(AArch64ISD::LOADgot) 2300 MAKE_CASE(AArch64ISD::RET_FLAG) 2301 MAKE_CASE(AArch64ISD::BRCOND) 2302 MAKE_CASE(AArch64ISD::CSEL) 2303 MAKE_CASE(AArch64ISD::CSINV) 2304 MAKE_CASE(AArch64ISD::CSNEG) 2305 MAKE_CASE(AArch64ISD::CSINC) 2306 MAKE_CASE(AArch64ISD::THREAD_POINTER) 2307 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) 2308 MAKE_CASE(AArch64ISD::ABDS_PRED) 2309 MAKE_CASE(AArch64ISD::ABDU_PRED) 2310 MAKE_CASE(AArch64ISD::HADDS_PRED) 2311 MAKE_CASE(AArch64ISD::HADDU_PRED) 2312 MAKE_CASE(AArch64ISD::MUL_PRED) 2313 MAKE_CASE(AArch64ISD::MULHS_PRED) 2314 MAKE_CASE(AArch64ISD::MULHU_PRED) 2315 MAKE_CASE(AArch64ISD::RHADDS_PRED) 2316 MAKE_CASE(AArch64ISD::RHADDU_PRED) 2317 MAKE_CASE(AArch64ISD::SDIV_PRED) 2318 MAKE_CASE(AArch64ISD::SHL_PRED) 2319 MAKE_CASE(AArch64ISD::SMAX_PRED) 2320 MAKE_CASE(AArch64ISD::SMIN_PRED) 2321 MAKE_CASE(AArch64ISD::SRA_PRED) 2322 MAKE_CASE(AArch64ISD::SRL_PRED) 2323 MAKE_CASE(AArch64ISD::UDIV_PRED) 2324 MAKE_CASE(AArch64ISD::UMAX_PRED) 2325 MAKE_CASE(AArch64ISD::UMIN_PRED) 2326 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) 2327 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) 2328 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) 2329 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) 2330 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) 2331 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) 2332 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) 2333 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) 2334 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) 2335 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) 2336 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) 2337 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) 2338 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) 2339 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) 2340 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) 2341 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) 2342 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) 2343 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) 2344 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) 2345 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) 2346 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) 2347 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) 2348 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) 2349 MAKE_CASE(AArch64ISD::ADC) 2350 MAKE_CASE(AArch64ISD::SBC) 2351 MAKE_CASE(AArch64ISD::ADDS) 2352 MAKE_CASE(AArch64ISD::SUBS) 2353 MAKE_CASE(AArch64ISD::ADCS) 2354 MAKE_CASE(AArch64ISD::SBCS) 2355 MAKE_CASE(AArch64ISD::ANDS) 2356 MAKE_CASE(AArch64ISD::CCMP) 2357 MAKE_CASE(AArch64ISD::CCMN) 2358 MAKE_CASE(AArch64ISD::FCCMP) 2359 MAKE_CASE(AArch64ISD::FCMP) 2360 MAKE_CASE(AArch64ISD::STRICT_FCMP) 2361 MAKE_CASE(AArch64ISD::STRICT_FCMPE) 2362 MAKE_CASE(AArch64ISD::DUP) 2363 MAKE_CASE(AArch64ISD::DUPLANE8) 2364 MAKE_CASE(AArch64ISD::DUPLANE16) 2365 
MAKE_CASE(AArch64ISD::DUPLANE32) 2366 MAKE_CASE(AArch64ISD::DUPLANE64) 2367 MAKE_CASE(AArch64ISD::DUPLANE128) 2368 MAKE_CASE(AArch64ISD::MOVI) 2369 MAKE_CASE(AArch64ISD::MOVIshift) 2370 MAKE_CASE(AArch64ISD::MOVIedit) 2371 MAKE_CASE(AArch64ISD::MOVImsl) 2372 MAKE_CASE(AArch64ISD::FMOV) 2373 MAKE_CASE(AArch64ISD::MVNIshift) 2374 MAKE_CASE(AArch64ISD::MVNImsl) 2375 MAKE_CASE(AArch64ISD::BICi) 2376 MAKE_CASE(AArch64ISD::ORRi) 2377 MAKE_CASE(AArch64ISD::BSP) 2378 MAKE_CASE(AArch64ISD::EXTR) 2379 MAKE_CASE(AArch64ISD::ZIP1) 2380 MAKE_CASE(AArch64ISD::ZIP2) 2381 MAKE_CASE(AArch64ISD::UZP1) 2382 MAKE_CASE(AArch64ISD::UZP2) 2383 MAKE_CASE(AArch64ISD::TRN1) 2384 MAKE_CASE(AArch64ISD::TRN2) 2385 MAKE_CASE(AArch64ISD::REV16) 2386 MAKE_CASE(AArch64ISD::REV32) 2387 MAKE_CASE(AArch64ISD::REV64) 2388 MAKE_CASE(AArch64ISD::EXT) 2389 MAKE_CASE(AArch64ISD::SPLICE) 2390 MAKE_CASE(AArch64ISD::VSHL) 2391 MAKE_CASE(AArch64ISD::VLSHR) 2392 MAKE_CASE(AArch64ISD::VASHR) 2393 MAKE_CASE(AArch64ISD::VSLI) 2394 MAKE_CASE(AArch64ISD::VSRI) 2395 MAKE_CASE(AArch64ISD::CMEQ) 2396 MAKE_CASE(AArch64ISD::CMGE) 2397 MAKE_CASE(AArch64ISD::CMGT) 2398 MAKE_CASE(AArch64ISD::CMHI) 2399 MAKE_CASE(AArch64ISD::CMHS) 2400 MAKE_CASE(AArch64ISD::FCMEQ) 2401 MAKE_CASE(AArch64ISD::FCMGE) 2402 MAKE_CASE(AArch64ISD::FCMGT) 2403 MAKE_CASE(AArch64ISD::CMEQz) 2404 MAKE_CASE(AArch64ISD::CMGEz) 2405 MAKE_CASE(AArch64ISD::CMGTz) 2406 MAKE_CASE(AArch64ISD::CMLEz) 2407 MAKE_CASE(AArch64ISD::CMLTz) 2408 MAKE_CASE(AArch64ISD::FCMEQz) 2409 MAKE_CASE(AArch64ISD::FCMGEz) 2410 MAKE_CASE(AArch64ISD::FCMGTz) 2411 MAKE_CASE(AArch64ISD::FCMLEz) 2412 MAKE_CASE(AArch64ISD::FCMLTz) 2413 MAKE_CASE(AArch64ISD::SADDV) 2414 MAKE_CASE(AArch64ISD::UADDV) 2415 MAKE_CASE(AArch64ISD::SDOT) 2416 MAKE_CASE(AArch64ISD::UDOT) 2417 MAKE_CASE(AArch64ISD::SMINV) 2418 MAKE_CASE(AArch64ISD::UMINV) 2419 MAKE_CASE(AArch64ISD::SMAXV) 2420 MAKE_CASE(AArch64ISD::UMAXV) 2421 MAKE_CASE(AArch64ISD::SADDV_PRED) 2422 MAKE_CASE(AArch64ISD::UADDV_PRED) 2423 MAKE_CASE(AArch64ISD::SMAXV_PRED) 2424 MAKE_CASE(AArch64ISD::UMAXV_PRED) 2425 MAKE_CASE(AArch64ISD::SMINV_PRED) 2426 MAKE_CASE(AArch64ISD::UMINV_PRED) 2427 MAKE_CASE(AArch64ISD::ORV_PRED) 2428 MAKE_CASE(AArch64ISD::EORV_PRED) 2429 MAKE_CASE(AArch64ISD::ANDV_PRED) 2430 MAKE_CASE(AArch64ISD::CLASTA_N) 2431 MAKE_CASE(AArch64ISD::CLASTB_N) 2432 MAKE_CASE(AArch64ISD::LASTA) 2433 MAKE_CASE(AArch64ISD::LASTB) 2434 MAKE_CASE(AArch64ISD::REINTERPRET_CAST) 2435 MAKE_CASE(AArch64ISD::LS64_BUILD) 2436 MAKE_CASE(AArch64ISD::LS64_EXTRACT) 2437 MAKE_CASE(AArch64ISD::TBL) 2438 MAKE_CASE(AArch64ISD::FADD_PRED) 2439 MAKE_CASE(AArch64ISD::FADDA_PRED) 2440 MAKE_CASE(AArch64ISD::FADDV_PRED) 2441 MAKE_CASE(AArch64ISD::FDIV_PRED) 2442 MAKE_CASE(AArch64ISD::FMA_PRED) 2443 MAKE_CASE(AArch64ISD::FMAX_PRED) 2444 MAKE_CASE(AArch64ISD::FMAXV_PRED) 2445 MAKE_CASE(AArch64ISD::FMAXNM_PRED) 2446 MAKE_CASE(AArch64ISD::FMAXNMV_PRED) 2447 MAKE_CASE(AArch64ISD::FMIN_PRED) 2448 MAKE_CASE(AArch64ISD::FMINV_PRED) 2449 MAKE_CASE(AArch64ISD::FMINNM_PRED) 2450 MAKE_CASE(AArch64ISD::FMINNMV_PRED) 2451 MAKE_CASE(AArch64ISD::FMUL_PRED) 2452 MAKE_CASE(AArch64ISD::FSUB_PRED) 2453 MAKE_CASE(AArch64ISD::RDSVL) 2454 MAKE_CASE(AArch64ISD::BIC) 2455 MAKE_CASE(AArch64ISD::BIT) 2456 MAKE_CASE(AArch64ISD::CBZ) 2457 MAKE_CASE(AArch64ISD::CBNZ) 2458 MAKE_CASE(AArch64ISD::TBZ) 2459 MAKE_CASE(AArch64ISD::TBNZ) 2460 MAKE_CASE(AArch64ISD::TC_RETURN) 2461 MAKE_CASE(AArch64ISD::PREFETCH) 2462 MAKE_CASE(AArch64ISD::SITOF) 2463 MAKE_CASE(AArch64ISD::UITOF) 2464 MAKE_CASE(AArch64ISD::NVCAST) 2465 
MAKE_CASE(AArch64ISD::MRS) 2466 MAKE_CASE(AArch64ISD::SQSHL_I) 2467 MAKE_CASE(AArch64ISD::UQSHL_I) 2468 MAKE_CASE(AArch64ISD::SRSHR_I) 2469 MAKE_CASE(AArch64ISD::URSHR_I) 2470 MAKE_CASE(AArch64ISD::SQSHLU_I) 2471 MAKE_CASE(AArch64ISD::WrapperLarge) 2472 MAKE_CASE(AArch64ISD::LD2post) 2473 MAKE_CASE(AArch64ISD::LD3post) 2474 MAKE_CASE(AArch64ISD::LD4post) 2475 MAKE_CASE(AArch64ISD::ST2post) 2476 MAKE_CASE(AArch64ISD::ST3post) 2477 MAKE_CASE(AArch64ISD::ST4post) 2478 MAKE_CASE(AArch64ISD::LD1x2post) 2479 MAKE_CASE(AArch64ISD::LD1x3post) 2480 MAKE_CASE(AArch64ISD::LD1x4post) 2481 MAKE_CASE(AArch64ISD::ST1x2post) 2482 MAKE_CASE(AArch64ISD::ST1x3post) 2483 MAKE_CASE(AArch64ISD::ST1x4post) 2484 MAKE_CASE(AArch64ISD::LD1DUPpost) 2485 MAKE_CASE(AArch64ISD::LD2DUPpost) 2486 MAKE_CASE(AArch64ISD::LD3DUPpost) 2487 MAKE_CASE(AArch64ISD::LD4DUPpost) 2488 MAKE_CASE(AArch64ISD::LD1LANEpost) 2489 MAKE_CASE(AArch64ISD::LD2LANEpost) 2490 MAKE_CASE(AArch64ISD::LD3LANEpost) 2491 MAKE_CASE(AArch64ISD::LD4LANEpost) 2492 MAKE_CASE(AArch64ISD::ST2LANEpost) 2493 MAKE_CASE(AArch64ISD::ST3LANEpost) 2494 MAKE_CASE(AArch64ISD::ST4LANEpost) 2495 MAKE_CASE(AArch64ISD::SMULL) 2496 MAKE_CASE(AArch64ISD::UMULL) 2497 MAKE_CASE(AArch64ISD::PMULL) 2498 MAKE_CASE(AArch64ISD::FRECPE) 2499 MAKE_CASE(AArch64ISD::FRECPS) 2500 MAKE_CASE(AArch64ISD::FRSQRTE) 2501 MAKE_CASE(AArch64ISD::FRSQRTS) 2502 MAKE_CASE(AArch64ISD::STG) 2503 MAKE_CASE(AArch64ISD::STZG) 2504 MAKE_CASE(AArch64ISD::ST2G) 2505 MAKE_CASE(AArch64ISD::STZ2G) 2506 MAKE_CASE(AArch64ISD::SUNPKHI) 2507 MAKE_CASE(AArch64ISD::SUNPKLO) 2508 MAKE_CASE(AArch64ISD::UUNPKHI) 2509 MAKE_CASE(AArch64ISD::UUNPKLO) 2510 MAKE_CASE(AArch64ISD::INSR) 2511 MAKE_CASE(AArch64ISD::PTEST) 2512 MAKE_CASE(AArch64ISD::PTEST_ANY) 2513 MAKE_CASE(AArch64ISD::PTRUE) 2514 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) 2515 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) 2516 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) 2517 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) 2518 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) 2519 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) 2520 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) 2521 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) 2522 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) 2523 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) 2524 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) 2525 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) 2526 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) 2527 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) 2528 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) 2529 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) 2530 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) 2531 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) 2532 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) 2533 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) 2534 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) 2535 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) 2536 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) 2537 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) 2538 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) 2539 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) 2540 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) 2541 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) 2542 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) 2543 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) 2544 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) 2545 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) 2546 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) 2547 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) 2548 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) 2549 
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) 2550 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) 2551 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) 2552 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) 2553 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) 2554 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) 2555 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) 2556 MAKE_CASE(AArch64ISD::ST1_PRED) 2557 MAKE_CASE(AArch64ISD::SST1_PRED) 2558 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) 2559 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) 2560 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) 2561 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) 2562 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) 2563 MAKE_CASE(AArch64ISD::SST1_IMM_PRED) 2564 MAKE_CASE(AArch64ISD::SSTNT1_PRED) 2565 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) 2566 MAKE_CASE(AArch64ISD::LDP) 2567 MAKE_CASE(AArch64ISD::LDNP) 2568 MAKE_CASE(AArch64ISD::STP) 2569 MAKE_CASE(AArch64ISD::STNP) 2570 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) 2571 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) 2572 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) 2573 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) 2574 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) 2575 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) 2576 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) 2577 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) 2578 MAKE_CASE(AArch64ISD::INDEX_VECTOR) 2579 MAKE_CASE(AArch64ISD::ADDP) 2580 MAKE_CASE(AArch64ISD::SADDLP) 2581 MAKE_CASE(AArch64ISD::UADDLP) 2582 MAKE_CASE(AArch64ISD::CALL_RVMARKER) 2583 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) 2584 MAKE_CASE(AArch64ISD::MOPS_MEMSET) 2585 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) 2586 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) 2587 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) 2588 MAKE_CASE(AArch64ISD::CALL_BTI) 2589 MAKE_CASE(AArch64ISD::MRRS) 2590 MAKE_CASE(AArch64ISD::MSRR) 2591 } 2592 #undef MAKE_CASE 2593 return nullptr; 2594 } 2595 2596 MachineBasicBlock * 2597 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 2598 MachineBasicBlock *MBB) const { 2599 // We materialise the F128CSEL pseudo-instruction as some control flow and a 2600 // phi node: 2601 2602 // OrigBB: 2603 // [... previous instrs leading to comparison ...] 2604 // b.ne TrueBB 2605 // b EndBB 2606 // TrueBB: 2607 // ; Fallthrough 2608 // EndBB: 2609 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 2610 2611 MachineFunction *MF = MBB->getParent(); 2612 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2613 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 2614 DebugLoc DL = MI.getDebugLoc(); 2615 MachineFunction::iterator It = ++MBB->getIterator(); 2616 2617 Register DestReg = MI.getOperand(0).getReg(); 2618 Register IfTrueReg = MI.getOperand(1).getReg(); 2619 Register IfFalseReg = MI.getOperand(2).getReg(); 2620 unsigned CondCode = MI.getOperand(3).getImm(); 2621 bool NZCVKilled = MI.getOperand(4).isKill(); 2622 2623 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 2624 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 2625 MF->insert(It, TrueBB); 2626 MF->insert(It, EndBB); 2627 2628 // Transfer rest of current basic-block to EndBB 2629 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 2630 MBB->end()); 2631 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 2632 2633 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 2634 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 2635 MBB->addSuccessor(TrueBB); 2636 MBB->addSuccessor(EndBB); 2637 2638 // TrueBB falls through to the end. 
2639 TrueBB->addSuccessor(EndBB); 2640 2641 if (!NZCVKilled) { 2642 TrueBB->addLiveIn(AArch64::NZCV); 2643 EndBB->addLiveIn(AArch64::NZCV); 2644 } 2645 2646 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 2647 .addReg(IfTrueReg) 2648 .addMBB(TrueBB) 2649 .addReg(IfFalseReg) 2650 .addMBB(MBB); 2651 2652 MI.eraseFromParent(); 2653 return EndBB; 2654 } 2655 2656 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( 2657 MachineInstr &MI, MachineBasicBlock *BB) const { 2658 assert(!isAsynchronousEHPersonality(classifyEHPersonality( 2659 BB->getParent()->getFunction().getPersonalityFn())) && 2660 "SEH does not use catchret!"); 2661 return BB; 2662 } 2663 2664 MachineBasicBlock * 2665 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, 2666 MachineInstr &MI, 2667 MachineBasicBlock *BB) const { 2668 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2669 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); 2670 2671 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); 2672 MIB.add(MI.getOperand(1)); // slice index register 2673 MIB.add(MI.getOperand(2)); // slice index offset 2674 MIB.add(MI.getOperand(3)); // pg 2675 MIB.add(MI.getOperand(4)); // base 2676 MIB.add(MI.getOperand(5)); // offset 2677 2678 MI.eraseFromParent(); // The pseudo is gone now. 2679 return BB; 2680 } 2681 2682 MachineBasicBlock * 2683 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { 2684 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2685 MachineInstrBuilder MIB = 2686 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); 2687 2688 MIB.addReg(AArch64::ZA, RegState::Define); 2689 MIB.add(MI.getOperand(0)); // Vector select register 2690 MIB.add(MI.getOperand(1)); // Vector select offset 2691 MIB.add(MI.getOperand(2)); // Base 2692 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset 2693 2694 MI.eraseFromParent(); // The pseudo is gone now. 2695 return BB; 2696 } 2697 2698 MachineBasicBlock * 2699 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, 2700 MachineInstr &MI, 2701 MachineBasicBlock *BB, bool HasTile) const { 2702 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2703 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); 2704 unsigned StartIdx = 0; 2705 2706 if (HasTile) { 2707 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); 2708 MIB.addReg(BaseReg + MI.getOperand(0).getImm()); 2709 StartIdx = 1; 2710 } else 2711 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg); 2712 2713 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) 2714 MIB.add(MI.getOperand(I)); 2715 2716 MI.eraseFromParent(); // The pseudo is gone now. 2717 return BB; 2718 } 2719 2720 MachineBasicBlock * 2721 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { 2722 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2723 MachineInstrBuilder MIB = 2724 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M)); 2725 MIB.add(MI.getOperand(0)); // Mask 2726 2727 unsigned Mask = MI.getOperand(0).getImm(); 2728 for (unsigned I = 0; I < 8; I++) { 2729 if (Mask & (1 << I)) 2730 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine); 2731 } 2732 2733 MI.eraseFromParent(); // The pseudo is gone now. 
2734 return BB; 2735 } 2736 2737 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 2738 MachineInstr &MI, MachineBasicBlock *BB) const { 2739 2740 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode()); 2741 if (SMEOrigInstr != -1) { 2742 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2743 uint64_t SMEMatrixType = 2744 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; 2745 switch (SMEMatrixType) { 2746 case (AArch64::SMEMatrixArray): 2747 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false); 2748 case (AArch64::SMEMatrixTileB): 2749 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true); 2750 case (AArch64::SMEMatrixTileH): 2751 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true); 2752 case (AArch64::SMEMatrixTileS): 2753 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true); 2754 case (AArch64::SMEMatrixTileD): 2755 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true); 2756 case (AArch64::SMEMatrixTileQ): 2757 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true); 2758 } 2759 } 2760 2761 switch (MI.getOpcode()) { 2762 default: 2763 #ifndef NDEBUG 2764 MI.dump(); 2765 #endif 2766 llvm_unreachable("Unexpected instruction for custom inserter!"); 2767 2768 case AArch64::F128CSEL: 2769 return EmitF128CSEL(MI, BB); 2770 case TargetOpcode::STATEPOINT: 2771 // STATEPOINT is a pseudo instruction which has no implicit defs/uses 2772 // while bl call instruction (where statepoint will be lowered at the end) 2773 // has implicit def. This def is early-clobber as it will be set at 2774 // the moment of the call and earlier than any use is read. 2775 // Add this implicit dead def here as a workaround. 
2776 MI.addOperand(*MI.getMF(), 2777 MachineOperand::CreateReg( 2778 AArch64::LR, /*isDef*/ true, 2779 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, 2780 /*isUndef*/ false, /*isEarlyClobber*/ true)); 2781 [[fallthrough]]; 2782 case TargetOpcode::STACKMAP: 2783 case TargetOpcode::PATCHPOINT: 2784 return emitPatchPoint(MI, BB); 2785 2786 case AArch64::CATCHRET: 2787 return EmitLoweredCatchRet(MI, BB); 2788 case AArch64::LD1_MXIPXX_H_PSEUDO_B: 2789 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); 2790 case AArch64::LD1_MXIPXX_H_PSEUDO_H: 2791 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB); 2792 case AArch64::LD1_MXIPXX_H_PSEUDO_S: 2793 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB); 2794 case AArch64::LD1_MXIPXX_H_PSEUDO_D: 2795 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB); 2796 case AArch64::LD1_MXIPXX_H_PSEUDO_Q: 2797 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB); 2798 case AArch64::LD1_MXIPXX_V_PSEUDO_B: 2799 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB); 2800 case AArch64::LD1_MXIPXX_V_PSEUDO_H: 2801 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB); 2802 case AArch64::LD1_MXIPXX_V_PSEUDO_S: 2803 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB); 2804 case AArch64::LD1_MXIPXX_V_PSEUDO_D: 2805 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB); 2806 case AArch64::LD1_MXIPXX_V_PSEUDO_Q: 2807 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); 2808 case AArch64::LDR_ZA_PSEUDO: 2809 return EmitFill(MI, BB); 2810 case AArch64::ZERO_M_PSEUDO: 2811 return EmitZero(MI, BB); 2812 } 2813 } 2814 2815 //===----------------------------------------------------------------------===// 2816 // AArch64 Lowering private implementation. 2817 //===----------------------------------------------------------------------===// 2818 2819 //===----------------------------------------------------------------------===// 2820 // Lowering Code 2821 //===----------------------------------------------------------------------===// 2822 2823 // Forward declarations of SVE fixed length lowering helpers 2824 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); 2825 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); 2826 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); 2827 static SDValue convertFixedMaskToScalableVector(SDValue Mask, 2828 SelectionDAG &DAG); 2829 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, 2830 EVT VT); 2831 2832 /// isZerosVector - Check whether SDNode N is a zero-filled vector. 2833 static bool isZerosVector(const SDNode *N) { 2834 // Look through a bit convert. 
2835 while (N->getOpcode() == ISD::BITCAST) 2836 N = N->getOperand(0).getNode(); 2837 2838 if (ISD::isConstantSplatVectorAllZeros(N)) 2839 return true; 2840 2841 if (N->getOpcode() != AArch64ISD::DUP) 2842 return false; 2843 2844 auto Opnd0 = N->getOperand(0); 2845 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0); 2846 } 2847 2848 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 2849 /// CC 2850 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 2851 switch (CC) { 2852 default: 2853 llvm_unreachable("Unknown condition code!"); 2854 case ISD::SETNE: 2855 return AArch64CC::NE; 2856 case ISD::SETEQ: 2857 return AArch64CC::EQ; 2858 case ISD::SETGT: 2859 return AArch64CC::GT; 2860 case ISD::SETGE: 2861 return AArch64CC::GE; 2862 case ISD::SETLT: 2863 return AArch64CC::LT; 2864 case ISD::SETLE: 2865 return AArch64CC::LE; 2866 case ISD::SETUGT: 2867 return AArch64CC::HI; 2868 case ISD::SETUGE: 2869 return AArch64CC::HS; 2870 case ISD::SETULT: 2871 return AArch64CC::LO; 2872 case ISD::SETULE: 2873 return AArch64CC::LS; 2874 } 2875 } 2876 2877 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 2878 static void changeFPCCToAArch64CC(ISD::CondCode CC, 2879 AArch64CC::CondCode &CondCode, 2880 AArch64CC::CondCode &CondCode2) { 2881 CondCode2 = AArch64CC::AL; 2882 switch (CC) { 2883 default: 2884 llvm_unreachable("Unknown FP condition!"); 2885 case ISD::SETEQ: 2886 case ISD::SETOEQ: 2887 CondCode = AArch64CC::EQ; 2888 break; 2889 case ISD::SETGT: 2890 case ISD::SETOGT: 2891 CondCode = AArch64CC::GT; 2892 break; 2893 case ISD::SETGE: 2894 case ISD::SETOGE: 2895 CondCode = AArch64CC::GE; 2896 break; 2897 case ISD::SETOLT: 2898 CondCode = AArch64CC::MI; 2899 break; 2900 case ISD::SETOLE: 2901 CondCode = AArch64CC::LS; 2902 break; 2903 case ISD::SETONE: 2904 CondCode = AArch64CC::MI; 2905 CondCode2 = AArch64CC::GT; 2906 break; 2907 case ISD::SETO: 2908 CondCode = AArch64CC::VC; 2909 break; 2910 case ISD::SETUO: 2911 CondCode = AArch64CC::VS; 2912 break; 2913 case ISD::SETUEQ: 2914 CondCode = AArch64CC::EQ; 2915 CondCode2 = AArch64CC::VS; 2916 break; 2917 case ISD::SETUGT: 2918 CondCode = AArch64CC::HI; 2919 break; 2920 case ISD::SETUGE: 2921 CondCode = AArch64CC::PL; 2922 break; 2923 case ISD::SETLT: 2924 case ISD::SETULT: 2925 CondCode = AArch64CC::LT; 2926 break; 2927 case ISD::SETLE: 2928 case ISD::SETULE: 2929 CondCode = AArch64CC::LE; 2930 break; 2931 case ISD::SETNE: 2932 case ISD::SETUNE: 2933 CondCode = AArch64CC::NE; 2934 break; 2935 } 2936 } 2937 2938 /// Convert a DAG fp condition code to an AArch64 CC. 2939 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 2940 /// should be AND'ed instead of OR'ed. 
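/// For example (illustrative note): SETONE maps to VC and NE below, i.e.
/// "ordered" AND "not equal", whereas changeFPCCToAArch64CC above returns
/// MI and GT, which the caller is expected to OR together.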
2941 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 2942 AArch64CC::CondCode &CondCode, 2943 AArch64CC::CondCode &CondCode2) { 2944 CondCode2 = AArch64CC::AL; 2945 switch (CC) { 2946 default: 2947 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 2948 assert(CondCode2 == AArch64CC::AL); 2949 break; 2950 case ISD::SETONE: 2951 // (a one b) 2952 // == ((a olt b) || (a ogt b)) 2953 // == ((a ord b) && (a une b)) 2954 CondCode = AArch64CC::VC; 2955 CondCode2 = AArch64CC::NE; 2956 break; 2957 case ISD::SETUEQ: 2958 // (a ueq b) 2959 // == ((a uno b) || (a oeq b)) 2960 // == ((a ule b) && (a uge b)) 2961 CondCode = AArch64CC::PL; 2962 CondCode2 = AArch64CC::LE; 2963 break; 2964 } 2965 } 2966 2967 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 2968 /// CC usable with the vector instructions. Fewer operations are available 2969 /// without a real NZCV register, so we have to use less efficient combinations 2970 /// to get the same effect. 2971 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 2972 AArch64CC::CondCode &CondCode, 2973 AArch64CC::CondCode &CondCode2, 2974 bool &Invert) { 2975 Invert = false; 2976 switch (CC) { 2977 default: 2978 // Mostly the scalar mappings work fine. 2979 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 2980 break; 2981 case ISD::SETUO: 2982 Invert = true; 2983 [[fallthrough]]; 2984 case ISD::SETO: 2985 CondCode = AArch64CC::MI; 2986 CondCode2 = AArch64CC::GE; 2987 break; 2988 case ISD::SETUEQ: 2989 case ISD::SETULT: 2990 case ISD::SETULE: 2991 case ISD::SETUGT: 2992 case ISD::SETUGE: 2993 // All of the compare-mask comparisons are ordered, but we can switch 2994 // between the two by a double inversion. E.g. ULE == !OGT. 2995 Invert = true; 2996 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), 2997 CondCode, CondCode2); 2998 break; 2999 } 3000 } 3001 3002 static bool isLegalArithImmed(uint64_t C) { 3003 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 3004 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 3005 LLVM_DEBUG(dbgs() << "Is imm " << C 3006 << " legal: " << (IsLegal ? "yes\n" : "no\n")); 3007 return IsLegal; 3008 } 3009 3010 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on 3011 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags 3012 // can be set differently by this operation. It comes down to whether 3013 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 3014 // everything is fine. If not then the optimization is wrong. Thus general 3015 // comparisons are only valid if op2 != 0. 3016 // 3017 // So, finally, the only LLVM-native comparisons that don't mention C and V 3018 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 3019 // the absence of information about op2. 
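// Illustrative example: with op2 == 0, "subs wzr, w0, #0" always sets C = 1
// (no borrow) while the CMN form "adds wzr, w0, #0" leaves C = 0, so C-based
// conditions such as HS/LO would disagree; Z (and N) are identical either
// way, which is why only EQ/NE are safe here.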
3020 static bool isCMN(SDValue Op, ISD::CondCode CC) { 3021 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && 3022 (CC == ISD::SETEQ || CC == ISD::SETNE); 3023 } 3024 3025 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, 3026 SelectionDAG &DAG, SDValue Chain, 3027 bool IsSignaling) { 3028 EVT VT = LHS.getValueType(); 3029 assert(VT != MVT::f128); 3030 3031 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); 3032 3033 if (VT == MVT::f16 && !FullFP16) { 3034 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, 3035 {Chain, LHS}); 3036 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, 3037 {LHS.getValue(1), RHS}); 3038 Chain = RHS.getValue(1); 3039 VT = MVT::f32; 3040 } 3041 unsigned Opcode = 3042 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; 3043 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); 3044 } 3045 3046 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3047 const SDLoc &dl, SelectionDAG &DAG) { 3048 EVT VT = LHS.getValueType(); 3049 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); 3050 3051 if (VT.isFloatingPoint()) { 3052 assert(VT != MVT::f128); 3053 if (VT == MVT::f16 && !FullFP16) { 3054 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 3055 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 3056 VT = MVT::f32; 3057 } 3058 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 3059 } 3060 3061 // The CMP instruction is just an alias for SUBS, and representing it as 3062 // SUBS means that it's possible to get CSE with subtract operations. 3063 // A later phase can perform the optimization of setting the destination 3064 // register to WZR/XZR if it ends up being unused. 3065 unsigned Opcode = AArch64ISD::SUBS; 3066 3067 if (isCMN(RHS, CC)) { 3068 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? 3069 Opcode = AArch64ISD::ADDS; 3070 RHS = RHS.getOperand(1); 3071 } else if (isCMN(LHS, CC)) { 3072 // As we are looking for EQ/NE compares, the operands can be commuted ; can 3073 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? 3074 Opcode = AArch64ISD::ADDS; 3075 LHS = LHS.getOperand(1); 3076 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { 3077 if (LHS.getOpcode() == ISD::AND) { 3078 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST 3079 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one 3080 // of the signed comparisons. 3081 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, 3082 DAG.getVTList(VT, MVT_CC), 3083 LHS.getOperand(0), 3084 LHS.getOperand(1)); 3085 // Replace all users of (and X, Y) with newly generated (ands X, Y) 3086 DAG.ReplaceAllUsesWith(LHS, ANDSNode); 3087 return ANDSNode.getValue(1); 3088 } else if (LHS.getOpcode() == AArch64ISD::ANDS) { 3089 // Use result of ANDS 3090 return LHS.getValue(1); 3091 } 3092 } 3093 3094 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) 3095 .getValue(1); 3096 } 3097 3098 /// \defgroup AArch64CCMP CMP;CCMP matching 3099 /// 3100 /// These functions deal with the formation of CMP;CCMP;... sequences. 3101 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of 3102 /// a comparison. They set the NZCV flags to a predefined value if their 3103 /// predicate is false. 
This allows us to express arbitrary conjunctions, for 3104 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" 3105 /// expressed as: 3106 /// cmp A 3107 /// ccmp B, inv(CB), CA 3108 /// check for CB flags 3109 /// 3110 /// This naturally lets us implement chains of AND operations with SETCC 3111 /// operands. And we can even implement some other situations by transforming 3112 /// them: 3113 /// - We can implement (NEG SETCC) i.e. negating a single comparison by 3114 /// negating the flags used in a CCMP/FCCMP operation. 3115 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations 3116 /// by negating the flags we test for afterwards. i.e. 3117 /// NEG (CMP CCMP CCCMP ...) can be implemented. 3118 /// - Note that we can only ever negate all previously processed results. 3119 /// What we cannot implement by flipping the flags to test is a negation 3120 /// of two sub-trees (because the negation affects all sub-trees emitted so 3121 /// far, so the 2nd sub-tree we emit would also affect the first). 3122 /// With those tools we can implement some OR operations: 3123 /// - (OR (SETCC A) (SETCC B)) can be implemented via: 3124 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) 3125 /// - After transforming OR to NEG/AND combinations we may be able to use NEG 3126 /// elimination rules from earlier to implement the whole thing as a 3127 /// CCMP/FCCMP chain. 3128 /// 3129 /// As a complete example: 3130 /// or (or (setCA (cmp A)) (setCB (cmp B))) 3131 /// (and (setCC (cmp C)) (setCD (cmp D)))" 3132 /// can be reassociated to: 3133 /// or (and (setCC (cmp C)) (setCD (cmp D))) 3134 /// (or (setCA (cmp A)) (setCB (cmp B))) 3135 /// can be transformed to: 3136 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) 3137 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" 3138 /// which can be implemented as: 3139 /// cmp C 3140 /// ccmp D, inv(CD), CC 3141 /// ccmp A, CA, inv(CD) 3142 /// ccmp B, CB, inv(CA) 3143 /// check for CB flags 3144 /// 3145 /// A counterexample is "or (and A B) (and C D)" which translates to 3146 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we 3147 /// can only implement one of the inner (not) operations, but not both! 3148 /// @{ 3149 3150 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate. 3151 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, 3152 ISD::CondCode CC, SDValue CCOp, 3153 AArch64CC::CondCode Predicate, 3154 AArch64CC::CondCode OutCC, 3155 const SDLoc &DL, SelectionDAG &DAG) { 3156 unsigned Opcode = 0; 3157 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); 3158 3159 if (LHS.getValueType().isFloatingPoint()) { 3160 assert(LHS.getValueType() != MVT::f128); 3161 if (LHS.getValueType() == MVT::f16 && !FullFP16) { 3162 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); 3163 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); 3164 } 3165 Opcode = AArch64ISD::FCCMP; 3166 } else if (RHS.getOpcode() == ISD::SUB) { 3167 SDValue SubOp0 = RHS.getOperand(0); 3168 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 3169 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3170 Opcode = AArch64ISD::CCMN; 3171 RHS = RHS.getOperand(1); 3172 } 3173 } 3174 if (Opcode == 0) 3175 Opcode = AArch64ISD::CCMP; 3176 3177 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); 3178 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 3179 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 3180 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); 3181 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); 3182 } 3183 3184 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be 3185 /// expressed as a conjunction. See \ref AArch64CCMP. 3186 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 3187 /// changing the conditions on the SETCC tests. 3188 /// (this means we can call emitConjunctionRec() with 3189 /// Negate==true on this sub-tree) 3190 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 3191 /// cannot do the negation naturally. We are required to 3192 /// emit the subtree first in this case. 3193 /// \param WillNegate Is true if we are called when the result of this 3194 /// subexpression must be negated. This happens when the 3195 /// outer expression is an OR. We can use this fact to know 3196 /// that we have a double negation (or (or ...) ...) that 3197 /// can be implemented for free. 3198 static bool canEmitConjunction(const SDValue Val, bool &CanNegate, 3199 bool &MustBeFirst, bool WillNegate, 3200 unsigned Depth = 0) { 3201 if (!Val.hasOneUse()) 3202 return false; 3203 unsigned Opcode = Val->getOpcode(); 3204 if (Opcode == ISD::SETCC) { 3205 if (Val->getOperand(0).getValueType() == MVT::f128) 3206 return false; 3207 CanNegate = true; 3208 MustBeFirst = false; 3209 return true; 3210 } 3211 // Protect against exponential runtime and stack overflow. 3212 if (Depth > 6) 3213 return false; 3214 if (Opcode == ISD::AND || Opcode == ISD::OR) { 3215 bool IsOR = Opcode == ISD::OR; 3216 SDValue O0 = Val->getOperand(0); 3217 SDValue O1 = Val->getOperand(1); 3218 bool CanNegateL; 3219 bool MustBeFirstL; 3220 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) 3221 return false; 3222 bool CanNegateR; 3223 bool MustBeFirstR; 3224 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) 3225 return false; 3226 3227 if (MustBeFirstL && MustBeFirstR) 3228 return false; 3229 3230 if (IsOR) { 3231 // For an OR expression we need to be able to naturally negate at least 3232 // one side or we cannot do the transformation at all. 3233 if (!CanNegateL && !CanNegateR) 3234 return false; 3235 // If the result of the OR will be negated and we can naturally negate 3236 // the leaves, then this sub-tree as a whole negates naturally. 3237 CanNegate = WillNegate && CanNegateL && CanNegateR; 3238 // If we cannot naturally negate the whole sub-tree, then this must be 3239 // emitted first. 3240 MustBeFirst = !CanNegate; 3241 } else { 3242 assert(Opcode == ISD::AND && "Must be OR or AND"); 3243 // We cannot naturally negate an AND operation. 3244 CanNegate = false; 3245 MustBeFirst = MustBeFirstL || MustBeFirstR; 3246 } 3247 return true; 3248 } 3249 return false; 3250 } 3251 3252 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain 3253 /// of CCMP/CFCMP ops. See @ref AArch64CCMP. 3254 /// Tries to transform the given i1 producing node @p Val to a series of compare 3255 /// and conditional compare operations.
@returns an NZCV flags producing node 3256 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if 3257 /// transformation was not possible. 3258 /// \p Negate is true if we want this sub-tree being negated just by changing 3259 /// SETCC conditions. 3260 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, 3261 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, 3262 AArch64CC::CondCode Predicate) { 3263 // We're at a tree leaf, produce a conditional comparison operation. 3264 unsigned Opcode = Val->getOpcode(); 3265 if (Opcode == ISD::SETCC) { 3266 SDValue LHS = Val->getOperand(0); 3267 SDValue RHS = Val->getOperand(1); 3268 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); 3269 bool isInteger = LHS.getValueType().isInteger(); 3270 if (Negate) 3271 CC = getSetCCInverse(CC, LHS.getValueType()); 3272 SDLoc DL(Val); 3273 // Determine OutCC and handle FP special case. 3274 if (isInteger) { 3275 OutCC = changeIntCCToAArch64CC(CC); 3276 } else { 3277 assert(LHS.getValueType().isFloatingPoint()); 3278 AArch64CC::CondCode ExtraCC; 3279 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 3280 // Some floating point conditions can't be tested with a single condition 3281 // code. Construct an additional comparison in this case. 3282 if (ExtraCC != AArch64CC::AL) { 3283 SDValue ExtraCmp; 3284 if (!CCOp.getNode()) 3285 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 3286 else 3287 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 3288 ExtraCC, DL, DAG); 3289 CCOp = ExtraCmp; 3290 Predicate = ExtraCC; 3291 } 3292 } 3293 3294 // Produce a normal comparison if we are first in the chain 3295 if (!CCOp) 3296 return emitComparison(LHS, RHS, CC, DL, DAG); 3297 // Otherwise produce a ccmp. 3298 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 3299 DAG); 3300 } 3301 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); 3302 3303 bool IsOR = Opcode == ISD::OR; 3304 3305 SDValue LHS = Val->getOperand(0); 3306 bool CanNegateL; 3307 bool MustBeFirstL; 3308 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); 3309 assert(ValidL && "Valid conjunction/disjunction tree"); 3310 (void)ValidL; 3311 3312 SDValue RHS = Val->getOperand(1); 3313 bool CanNegateR; 3314 bool MustBeFirstR; 3315 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); 3316 assert(ValidR && "Valid conjunction/disjunction tree"); 3317 (void)ValidR; 3318 3319 // Swap sub-tree that must come first to the right side. 3320 if (MustBeFirstL) { 3321 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 3322 std::swap(LHS, RHS); 3323 std::swap(CanNegateL, CanNegateR); 3324 std::swap(MustBeFirstL, MustBeFirstR); 3325 } 3326 3327 bool NegateR; 3328 bool NegateAfterR; 3329 bool NegateL; 3330 bool NegateAfterAll; 3331 if (Opcode == ISD::OR) { 3332 // Swap the sub-tree that we can negate naturally to the left. 3333 if (!CanNegateL) { 3334 assert(CanNegateR && "at least one side must be negatable"); 3335 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 3336 assert(!Negate); 3337 std::swap(LHS, RHS); 3338 NegateR = false; 3339 NegateAfterR = true; 3340 } else { 3341 // Negate the left sub-tree if possible, otherwise negate the result. 
3342 NegateR = CanNegateR; 3343 NegateAfterR = !CanNegateR; 3344 } 3345 NegateL = true; 3346 NegateAfterAll = !Negate; 3347 } else { 3348 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); 3349 assert(!Negate && "Valid conjunction/disjunction tree"); 3350 3351 NegateL = false; 3352 NegateR = false; 3353 NegateAfterR = false; 3354 NegateAfterAll = false; 3355 } 3356 3357 // Emit sub-trees. 3358 AArch64CC::CondCode RHSCC; 3359 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); 3360 if (NegateAfterR) 3361 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 3362 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); 3363 if (NegateAfterAll) 3364 OutCC = AArch64CC::getInvertedCondCode(OutCC); 3365 return CmpL; 3366 } 3367 3368 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). 3369 /// In some cases this is even possible with OR operations in the expression. 3370 /// See \ref AArch64CCMP. 3371 /// \see emitConjunctionRec(). 3372 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, 3373 AArch64CC::CondCode &OutCC) { 3374 bool DummyCanNegate; 3375 bool DummyMustBeFirst; 3376 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) 3377 return SDValue(); 3378 3379 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); 3380 } 3381 3382 /// @} 3383 3384 /// Returns how profitable it is to fold a comparison's operand's shift and/or 3385 /// extension operations. 3386 static unsigned getCmpOperandFoldingProfit(SDValue Op) { 3387 auto isSupportedExtend = [&](SDValue V) { 3388 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) 3389 return true; 3390 3391 if (V.getOpcode() == ISD::AND) 3392 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { 3393 uint64_t Mask = MaskCst->getZExtValue(); 3394 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); 3395 } 3396 3397 return false; 3398 }; 3399 3400 if (!Op.hasOneUse()) 3401 return 0; 3402 3403 if (isSupportedExtend(Op)) 3404 return 1; 3405 3406 unsigned Opc = Op.getOpcode(); 3407 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 3408 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 3409 uint64_t Shift = ShiftCst->getZExtValue(); 3410 if (isSupportedExtend(Op.getOperand(0))) 3411 return (Shift <= 4) ? 2 : 1; 3412 EVT VT = Op.getValueType(); 3413 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) 3414 return 1; 3415 } 3416 3417 return 0; 3418 } 3419 3420 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3421 SDValue &AArch64cc, SelectionDAG &DAG, 3422 const SDLoc &dl) { 3423 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3424 EVT VT = RHS.getValueType(); 3425 uint64_t C = RHSC->getZExtValue(); 3426 if (!isLegalArithImmed(C)) { 3427 // Constant does not fit, try adjusting it by one? 3428 switch (CC) { 3429 default: 3430 break; 3431 case ISD::SETLT: 3432 case ISD::SETGE: 3433 if ((VT == MVT::i32 && C != 0x80000000 && 3434 isLegalArithImmed((uint32_t)(C - 1))) || 3435 (VT == MVT::i64 && C != 0x80000000ULL && 3436 isLegalArithImmed(C - 1ULL))) { 3437 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3438 C = (VT == MVT::i32) ? 
(uint32_t)(C - 1) : C - 1; 3439 RHS = DAG.getConstant(C, dl, VT); 3440 } 3441 break; 3442 case ISD::SETULT: 3443 case ISD::SETUGE: 3444 if ((VT == MVT::i32 && C != 0 && 3445 isLegalArithImmed((uint32_t)(C - 1))) || 3446 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 3447 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3448 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 3449 RHS = DAG.getConstant(C, dl, VT); 3450 } 3451 break; 3452 case ISD::SETLE: 3453 case ISD::SETGT: 3454 if ((VT == MVT::i32 && C != INT32_MAX && 3455 isLegalArithImmed((uint32_t)(C + 1))) || 3456 (VT == MVT::i64 && C != INT64_MAX && 3457 isLegalArithImmed(C + 1ULL))) { 3458 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 3459 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 3460 RHS = DAG.getConstant(C, dl, VT); 3461 } 3462 break; 3463 case ISD::SETULE: 3464 case ISD::SETUGT: 3465 if ((VT == MVT::i32 && C != UINT32_MAX && 3466 isLegalArithImmed((uint32_t)(C + 1))) || 3467 (VT == MVT::i64 && C != UINT64_MAX && 3468 isLegalArithImmed(C + 1ULL))) { 3469 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3470 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 3471 RHS = DAG.getConstant(C, dl, VT); 3472 } 3473 break; 3474 } 3475 } 3476 } 3477 3478 // Comparisons are canonicalized so that the RHS operand is simpler than the 3479 // LHS one, the extreme case being when RHS is an immediate. However, AArch64 3480 // can fold some shift+extend operations on the RHS operand, so swap the 3481 // operands if that can be done. 3482 // 3483 // For example: 3484 // lsl w13, w11, #1 3485 // cmp w13, w12 3486 // can be turned into: 3487 // cmp w12, w11, lsl #1 3488 if (!isa<ConstantSDNode>(RHS) || 3489 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { 3490 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; 3491 3492 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { 3493 std::swap(LHS, RHS); 3494 CC = ISD::getSetCCSwappedOperands(CC); 3495 } 3496 } 3497 3498 SDValue Cmp; 3499 AArch64CC::CondCode AArch64CC; 3500 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 3501 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 3502 3503 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 3504 // For the i8 operand, the largest immediate is 255, so this can be easily 3505 // encoded in the compare instruction. For the i16 operand, however, the 3506 // largest immediate cannot be encoded in the compare. 3507 // Therefore, use a sign extending load and cmn to avoid materializing the 3508 // -1 constant. For example, 3509 // movz w1, #65535 3510 // ldrh w0, [x0, #0] 3511 // cmp w0, w1 3512 // > 3513 // ldrsh w0, [x0, #0] 3514 // cmn w0, #1 3515 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 3516 // if and only if (sext LHS) == (sext RHS). The checks are in place to 3517 // ensure both the LHS and RHS are truly zero extended and to make sure the 3518 // transformation is profitable. 
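// (Illustrative restatement: the ">> 16 == 0" test below means the constant
// really is a 16-bit value, and ValueofRHS < 0 means that, viewed as a signed
// i16, it is a small negative number, so the comparison can be done against
// that value instead and selects to CMN with the positive magnitude.)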
3519 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 3520 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 3521 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 3522 LHS.getNode()->hasNUsesOfValue(1, 0)) { 3523 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 3524 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 3525 SDValue SExt = 3526 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 3527 DAG.getValueType(MVT::i16)); 3528 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 3529 RHS.getValueType()), 3530 CC, dl, DAG); 3531 AArch64CC = changeIntCCToAArch64CC(CC); 3532 } 3533 } 3534 3535 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { 3536 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { 3537 if ((CC == ISD::SETNE) ^ RHSC->isZero()) 3538 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 3539 } 3540 } 3541 } 3542 3543 if (!Cmp) { 3544 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3545 AArch64CC = changeIntCCToAArch64CC(CC); 3546 } 3547 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 3548 return Cmp; 3549 } 3550 3551 static std::pair<SDValue, SDValue> 3552 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 3553 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 3554 "Unsupported value type"); 3555 SDValue Value, Overflow; 3556 SDLoc DL(Op); 3557 SDValue LHS = Op.getOperand(0); 3558 SDValue RHS = Op.getOperand(1); 3559 unsigned Opc = 0; 3560 switch (Op.getOpcode()) { 3561 default: 3562 llvm_unreachable("Unknown overflow instruction!"); 3563 case ISD::SADDO: 3564 Opc = AArch64ISD::ADDS; 3565 CC = AArch64CC::VS; 3566 break; 3567 case ISD::UADDO: 3568 Opc = AArch64ISD::ADDS; 3569 CC = AArch64CC::HS; 3570 break; 3571 case ISD::SSUBO: 3572 Opc = AArch64ISD::SUBS; 3573 CC = AArch64CC::VS; 3574 break; 3575 case ISD::USUBO: 3576 Opc = AArch64ISD::SUBS; 3577 CC = AArch64CC::LO; 3578 break; 3579 // Multiply needs a little bit extra work. 3580 case ISD::SMULO: 3581 case ISD::UMULO: { 3582 CC = AArch64CC::NE; 3583 bool IsSigned = Op.getOpcode() == ISD::SMULO; 3584 if (Op.getValueType() == MVT::i32) { 3585 // Extend to 64-bits, then perform a 64-bit multiply. 3586 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3587 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 3588 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 3589 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3590 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); 3591 3592 // Check that the result fits into a 32-bit integer. 
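// (Illustrative: for signed i32 multiplies this checks that the full 64-bit
// product equals the sign-extension of its low 32 bits; for unsigned ones,
// that the high 32 bits of the product are zero.)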
3593 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); 3594 if (IsSigned) { 3595 // cmp xreg, wreg, sxtw 3596 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); 3597 Overflow = 3598 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); 3599 } else { 3600 // tst xreg, #0xffffffff00000000 3601 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); 3602 Overflow = 3603 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); 3604 } 3605 break; 3606 } 3607 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 3608 // For the 64 bit multiply 3609 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3610 if (IsSigned) { 3611 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 3612 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 3613 DAG.getConstant(63, DL, MVT::i64)); 3614 // It is important that LowerBits is last, otherwise the arithmetic 3615 // shift will not be folded into the compare (SUBS). 3616 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3617 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 3618 .getValue(1); 3619 } else { 3620 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 3621 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3622 Overflow = 3623 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 3624 DAG.getConstant(0, DL, MVT::i64), 3625 UpperBits).getValue(1); 3626 } 3627 break; 3628 } 3629 } // switch (...) 3630 3631 if (Opc) { 3632 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 3633 3634 // Emit the AArch64 operation with overflow check. 3635 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 3636 Overflow = Value.getValue(1); 3637 } 3638 return std::make_pair(Value, Overflow); 3639 } 3640 3641 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { 3642 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 3643 Subtarget->forceStreamingCompatibleSVE())) 3644 return LowerToScalableOp(Op, DAG); 3645 3646 SDValue Sel = Op.getOperand(0); 3647 SDValue Other = Op.getOperand(1); 3648 SDLoc dl(Sel); 3649 3650 // If the operand is an overflow checking operation, invert the condition 3651 // code and kill the Not operation. I.e., transform: 3652 // (xor (overflow_op_bool, 1)) 3653 // --> 3654 // (csel 1, 0, invert(cc), overflow_op_bool) 3655 // ... which later gets transformed to just a cset instruction with an 3656 // inverted condition code, rather than a cset + eor sequence. 3657 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { 3658 // Only lower legal XALUO ops. 3659 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) 3660 return SDValue(); 3661 3662 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3663 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3664 AArch64CC::CondCode CC; 3665 SDValue Value, Overflow; 3666 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); 3667 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3668 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, 3669 CCVal, Overflow); 3670 } 3671 // If neither operand is a SELECT_CC, give up. 3672 if (Sel.getOpcode() != ISD::SELECT_CC) 3673 std::swap(Sel, Other); 3674 if (Sel.getOpcode() != ISD::SELECT_CC) 3675 return Op; 3676 3677 // The folding we want to perform is: 3678 // (xor x, (select_cc a, b, cc, 0, -1) ) 3679 // --> 3680 // (csel x, (xor x, -1), cc ...) 3681 // 3682 // The latter will get matched to a CSINV instruction. 
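// (Illustrative: for i32 this typically ends up as "cmp w_a, w_b" followed by
// "csinv w_dst, w_x, w_x, <cc>", i.e. x when the condition holds and ~x
// otherwise; register names are placeholders.)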
3683 3684 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 3685 SDValue LHS = Sel.getOperand(0); 3686 SDValue RHS = Sel.getOperand(1); 3687 SDValue TVal = Sel.getOperand(2); 3688 SDValue FVal = Sel.getOperand(3); 3689 3690 // FIXME: This could be generalized to non-integer comparisons. 3691 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 3692 return Op; 3693 3694 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 3695 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 3696 3697 // The values aren't constants, this isn't the pattern we're looking for. 3698 if (!CFVal || !CTVal) 3699 return Op; 3700 3701 // We can commute the SELECT_CC by inverting the condition. This 3702 // might be needed to make this fit into a CSINV pattern. 3703 if (CTVal->isAllOnes() && CFVal->isZero()) { 3704 std::swap(TVal, FVal); 3705 std::swap(CTVal, CFVal); 3706 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 3707 } 3708 3709 // If the constants line up, perform the transform! 3710 if (CTVal->isZero() && CFVal->isAllOnes()) { 3711 SDValue CCVal; 3712 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3713 3714 FVal = Other; 3715 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 3716 DAG.getConstant(-1ULL, dl, Other.getValueType())); 3717 3718 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 3719 CCVal, Cmp); 3720 } 3721 3722 return Op; 3723 } 3724 3725 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' 3726 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else 3727 // sets 'C' bit to 0. 3728 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { 3729 SDLoc DL(Value); 3730 EVT VT = Value.getValueType(); 3731 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; 3732 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); 3733 SDValue Cmp = 3734 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); 3735 return Cmp.getValue(1); 3736 } 3737 3738 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. 3739 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. 3740 static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG, 3741 bool Invert) { 3742 assert(Flag.getResNo() == 1); 3743 SDLoc DL(Flag); 3744 SDValue Zero = DAG.getConstant(0, DL, VT); 3745 SDValue One = DAG.getConstant(1, DL, VT); 3746 unsigned Cond = Invert ? 
AArch64CC::LO : AArch64CC::HS; 3747 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); 3748 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); 3749 } 3750 3751 // Value is 1 if 'V' bit of NZCV is 1, else 0 3752 static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) { 3753 assert(Flag.getResNo() == 1); 3754 SDLoc DL(Flag); 3755 SDValue Zero = DAG.getConstant(0, DL, VT); 3756 SDValue One = DAG.getConstant(1, DL, VT); 3757 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); 3758 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); 3759 } 3760 3761 // This lowering is inefficient, but it will get cleaned up by 3762 // `foldOverflowCheck` 3763 static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, 3764 bool IsSigned) { 3765 EVT VT0 = Op.getValue(0).getValueType(); 3766 EVT VT1 = Op.getValue(1).getValueType(); 3767 3768 if (VT0 != MVT::i32 && VT0 != MVT::i64) 3769 return SDValue(); 3770 3771 bool InvertCarry = Opcode == AArch64ISD::SBCS; 3772 SDValue OpLHS = Op.getOperand(0); 3773 SDValue OpRHS = Op.getOperand(1); 3774 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); 3775 3776 SDLoc DL(Op); 3777 SDVTList VTs = DAG.getVTList(VT0, VT1); 3778 3779 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, 3780 OpRHS, OpCarryIn); 3781 3782 SDValue OutFlag = 3783 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) 3784 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); 3785 3786 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); 3787 } 3788 3789 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 3790 // Let legalize expand this if it isn't a legal type yet. 3791 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3792 return SDValue(); 3793 3794 SDLoc dl(Op); 3795 AArch64CC::CondCode CC; 3796 // The actual operation that sets the overflow or carry flag. 3797 SDValue Value, Overflow; 3798 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 3799 3800 // We use 0 and 1 as false and true values. 3801 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3802 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3803 3804 // We use an inverted condition, because the conditional select is inverted 3805 // too. This will allow it to be selected to a single instruction: 3806 // CSINC Wd, WZR, WZR, invert(cond). 3807 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3808 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 3809 CCVal, Overflow); 3810 3811 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3812 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3813 } 3814 3815 // Prefetch operands are: 3816 // 1: Address to prefetch 3817 // 2: bool isWrite 3818 // 3: int locality (0 = no locality ... 3 = extreme locality) 3819 // 4: bool isDataCache 3820 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 3821 SDLoc DL(Op); 3822 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 3823 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 3824 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3825 3826 bool IsStream = !Locality; 3827 // When the locality number is set 3828 if (Locality) { 3829 // The front-end should have filtered out the out-of-range values 3830 assert(Locality <= 3 && "Prefetch locality out-of-range"); 3831 // The locality degree is the opposite of the cache speed. 
3832 // Put the number the other way around. 3833 // The encoding starts at 0 for level 1 3834 Locality = 3 - Locality; 3835 } 3836 3837 // built the mask value encoding the expected behavior. 3838 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 3839 (!IsData << 3) | // IsDataCache bit 3840 (Locality << 1) | // Cache level bits 3841 (unsigned)IsStream; // Stream bit 3842 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 3843 DAG.getTargetConstant(PrfOp, DL, MVT::i32), 3844 Op.getOperand(1)); 3845 } 3846 3847 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 3848 SelectionDAG &DAG) const { 3849 EVT VT = Op.getValueType(); 3850 if (VT.isScalableVector()) 3851 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); 3852 3853 if (useSVEForFixedLengthVectorVT(VT)) 3854 return LowerFixedLengthFPExtendToSVE(Op, DAG); 3855 3856 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 3857 return SDValue(); 3858 } 3859 3860 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 3861 SelectionDAG &DAG) const { 3862 if (Op.getValueType().isScalableVector()) 3863 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); 3864 3865 bool IsStrict = Op->isStrictFPOpcode(); 3866 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 3867 EVT SrcVT = SrcVal.getValueType(); 3868 3869 if (useSVEForFixedLengthVectorVT(SrcVT, 3870 Subtarget->forceStreamingCompatibleSVE())) 3871 return LowerFixedLengthFPRoundToSVE(Op, DAG); 3872 3873 if (SrcVT != MVT::f128) { 3874 // Expand cases where the input is a vector bigger than NEON. 3875 if (useSVEForFixedLengthVectorVT(SrcVT)) 3876 return SDValue(); 3877 3878 // It's legal except when f128 is involved 3879 return Op; 3880 } 3881 3882 return SDValue(); 3883 } 3884 3885 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, 3886 SelectionDAG &DAG) const { 3887 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 3888 // Any additional optimization in this function should be recorded 3889 // in the cost tables. 3890 bool IsStrict = Op->isStrictFPOpcode(); 3891 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType(); 3892 EVT VT = Op.getValueType(); 3893 3894 if (VT.isScalableVector()) { 3895 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT 3896 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU 3897 : AArch64ISD::FCVTZS_MERGE_PASSTHRU; 3898 return LowerToPredicatedOp(Op, DAG, Opcode); 3899 } 3900 3901 if (useSVEForFixedLengthVectorVT(VT, 3902 Subtarget->forceStreamingCompatibleSVE()) || 3903 useSVEForFixedLengthVectorVT(InVT, 3904 Subtarget->forceStreamingCompatibleSVE())) 3905 return LowerFixedLengthFPToIntToSVE(Op, DAG); 3906 3907 unsigned NumElts = InVT.getVectorNumElements(); 3908 3909 // f16 conversions are promoted to f32 when full fp16 is not supported. 
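// (Illustrative: fptosi <4 x half> to <4 x i32> without +fullfp16 is lowered
// as an fpext to <4 x float> followed by the f32 -> i32 conversion.)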
3910 if (InVT.getVectorElementType() == MVT::f16 && 3911 !Subtarget->hasFullFP16()) { 3912 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 3913 SDLoc dl(Op); 3914 if (IsStrict) { 3915 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, 3916 {Op.getOperand(0), Op.getOperand(1)}); 3917 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, 3918 {Ext.getValue(1), Ext.getValue(0)}); 3919 } 3920 return DAG.getNode( 3921 Op.getOpcode(), dl, Op.getValueType(), 3922 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 3923 } 3924 3925 uint64_t VTSize = VT.getFixedSizeInBits(); 3926 uint64_t InVTSize = InVT.getFixedSizeInBits(); 3927 if (VTSize < InVTSize) { 3928 SDLoc dl(Op); 3929 if (IsStrict) { 3930 InVT = InVT.changeVectorElementTypeToInteger(); 3931 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, 3932 {Op.getOperand(0), Op.getOperand(1)}); 3933 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 3934 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); 3935 } 3936 SDValue Cv = 3937 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 3938 Op.getOperand(0)); 3939 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 3940 } 3941 3942 if (VTSize > InVTSize) { 3943 SDLoc dl(Op); 3944 MVT ExtVT = 3945 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 3946 VT.getVectorNumElements()); 3947 if (IsStrict) { 3948 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other}, 3949 {Op.getOperand(0), Op.getOperand(1)}); 3950 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, 3951 {Ext.getValue(1), Ext.getValue(0)}); 3952 } 3953 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 3954 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 3955 } 3956 3957 // Use a scalar operation for conversions between single-element vectors of 3958 // the same size. 3959 if (NumElts == 1) { 3960 SDLoc dl(Op); 3961 SDValue Extract = DAG.getNode( 3962 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), 3963 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); 3964 EVT ScalarVT = VT.getScalarType(); 3965 if (IsStrict) 3966 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, 3967 {Op.getOperand(0), Extract}); 3968 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); 3969 } 3970 3971 // Type changing conversions are illegal. 3972 return Op; 3973 } 3974 3975 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 3976 SelectionDAG &DAG) const { 3977 bool IsStrict = Op->isStrictFPOpcode(); 3978 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 3979 3980 if (SrcVal.getValueType().isVector()) 3981 return LowerVectorFP_TO_INT(Op, DAG); 3982 3983 // f16 conversions are promoted to f32 when full fp16 is not supported. 
3984 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { 3985 SDLoc dl(Op); 3986 if (IsStrict) { 3987 SDValue Ext = 3988 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, 3989 {Op.getOperand(0), SrcVal}); 3990 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other}, 3991 {Ext.getValue(1), Ext.getValue(0)}); 3992 } 3993 return DAG.getNode( 3994 Op.getOpcode(), dl, Op.getValueType(), 3995 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); 3996 } 3997 3998 if (SrcVal.getValueType() != MVT::f128) { 3999 // It's legal except when f128 is involved 4000 return Op; 4001 } 4002 4003 return SDValue(); 4004 } 4005 4006 SDValue 4007 AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, 4008 SelectionDAG &DAG) const { 4009 // AArch64 FP-to-int conversions saturate to the destination element size, so 4010 // we can lower common saturating conversions to simple instructions. 4011 SDValue SrcVal = Op.getOperand(0); 4012 EVT SrcVT = SrcVal.getValueType(); 4013 EVT DstVT = Op.getValueType(); 4014 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 4015 4016 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits(); 4017 uint64_t DstElementWidth = DstVT.getScalarSizeInBits(); 4018 uint64_t SatWidth = SatVT.getScalarSizeInBits(); 4019 assert(SatWidth <= DstElementWidth && 4020 "Saturation width cannot exceed result width"); 4021 4022 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. 4023 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable 4024 // types, so this is hard to reach. 4025 if (DstVT.isScalableVector()) 4026 return SDValue(); 4027 4028 EVT SrcElementVT = SrcVT.getVectorElementType(); 4029 4030 // In the absence of FP16 support, promote f16 to f32 and saturate the result. 4031 if (SrcElementVT == MVT::f16 && 4032 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) { 4033 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements()); 4034 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal); 4035 SrcVT = F32VT; 4036 SrcElementVT = MVT::f32; 4037 SrcElementWidth = 32; 4038 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 && 4039 SrcElementVT != MVT::f16) 4040 return SDValue(); 4041 4042 SDLoc DL(Op); 4043 // Cases that we can emit directly. 4044 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) 4045 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, 4046 DAG.getValueType(DstVT.getScalarType())); 4047 4048 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the 4049 // result. This is only valid if the legal cvt is larger than the saturate 4050 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize 4051 // (at least until sqxtn is selected). 
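// (Illustrative: llvm.fptosi.sat.v4i16.v4f32 takes this path as a native
// conversion to <4 x i32>, an smin/smax clamp to the i16 range, and a final
// truncate to <4 x i16>.)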
4052 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64) 4053 return SDValue(); 4054 4055 EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); 4056 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal, 4057 DAG.getValueType(IntVT.getScalarType())); 4058 SDValue Sat; 4059 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { 4060 SDValue MinC = DAG.getConstant( 4061 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT); 4062 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); 4063 SDValue MaxC = DAG.getConstant( 4064 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT); 4065 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); 4066 } else { 4067 SDValue MinC = DAG.getConstant( 4068 APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT); 4069 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); 4070 } 4071 4072 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); 4073 } 4074 4075 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, 4076 SelectionDAG &DAG) const { 4077 // AArch64 FP-to-int conversions saturate to the destination register size, so 4078 // we can lower common saturating conversions to simple instructions. 4079 SDValue SrcVal = Op.getOperand(0); 4080 EVT SrcVT = SrcVal.getValueType(); 4081 4082 if (SrcVT.isVector()) 4083 return LowerVectorFP_TO_INT_SAT(Op, DAG); 4084 4085 EVT DstVT = Op.getValueType(); 4086 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 4087 uint64_t SatWidth = SatVT.getScalarSizeInBits(); 4088 uint64_t DstWidth = DstVT.getScalarSizeInBits(); 4089 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); 4090 4091 // In the absence of FP16 support, promote f16 to f32 and saturate the result. 4092 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) { 4093 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal); 4094 SrcVT = MVT::f32; 4095 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16) 4096 return SDValue(); 4097 4098 SDLoc DL(Op); 4099 // Cases that we can emit directly. 4100 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || 4101 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && 4102 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32)) 4103 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, 4104 DAG.getValueType(DstVT)); 4105 4106 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the 4107 // result. This is only valid if the legal cvt is larger than the saturate 4108 // width. 4109 if (DstWidth < SatWidth) 4110 return SDValue(); 4111 4112 SDValue NativeCvt = 4113 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); 4114 SDValue Sat; 4115 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { 4116 SDValue MinC = DAG.getConstant( 4117 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT); 4118 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); 4119 SDValue MaxC = DAG.getConstant( 4120 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT); 4121 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); 4122 } else { 4123 SDValue MinC = DAG.getConstant( 4124 APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT); 4125 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); 4126 } 4127 4128 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); 4129 } 4130 4131 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, 4132 SelectionDAG &DAG) const { 4133 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 
4134 // Any additional optimization in this function should be recorded 4135 // in the cost tables. 4136 bool IsStrict = Op->isStrictFPOpcode(); 4137 EVT VT = Op.getValueType(); 4138 SDLoc dl(Op); 4139 SDValue In = Op.getOperand(IsStrict ? 1 : 0); 4140 EVT InVT = In.getValueType(); 4141 unsigned Opc = Op.getOpcode(); 4142 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; 4143 4144 if (VT.isScalableVector()) { 4145 if (InVT.getVectorElementType() == MVT::i1) { 4146 // We can't directly extend an SVE predicate; extend it first. 4147 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 4148 EVT CastVT = getPromotedVTForPredicate(InVT); 4149 In = DAG.getNode(CastOpc, dl, CastVT, In); 4150 return DAG.getNode(Opc, dl, VT, In); 4151 } 4152 4153 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU 4154 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; 4155 return LowerToPredicatedOp(Op, DAG, Opcode); 4156 } 4157 4158 if (useSVEForFixedLengthVectorVT(VT, 4159 Subtarget->forceStreamingCompatibleSVE()) || 4160 useSVEForFixedLengthVectorVT(InVT, 4161 Subtarget->forceStreamingCompatibleSVE())) 4162 return LowerFixedLengthIntToFPToSVE(Op, DAG); 4163 4164 uint64_t VTSize = VT.getFixedSizeInBits(); 4165 uint64_t InVTSize = InVT.getFixedSizeInBits(); 4166 if (VTSize < InVTSize) { 4167 MVT CastVT = 4168 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 4169 InVT.getVectorNumElements()); 4170 if (IsStrict) { 4171 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, 4172 {Op.getOperand(0), In}); 4173 return DAG.getNode( 4174 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, 4175 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); 4176 } 4177 In = DAG.getNode(Opc, dl, CastVT, In); 4178 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, 4179 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); 4180 } 4181 4182 if (VTSize > InVTSize) { 4183 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 4184 EVT CastVT = VT.changeVectorElementTypeToInteger(); 4185 In = DAG.getNode(CastOpc, dl, CastVT, In); 4186 if (IsStrict) 4187 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); 4188 return DAG.getNode(Opc, dl, VT, In); 4189 } 4190 4191 // Use a scalar operation for conversions between single-element vectors of 4192 // the same size. 4193 if (VT.getVectorNumElements() == 1) { 4194 SDValue Extract = DAG.getNode( 4195 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), 4196 In, DAG.getConstant(0, dl, MVT::i64)); 4197 EVT ScalarVT = VT.getScalarType(); 4198 if (IsStrict) 4199 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, 4200 {Op.getOperand(0), Extract}); 4201 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); 4202 } 4203 4204 return Op; 4205 } 4206 4207 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 4208 SelectionDAG &DAG) const { 4209 if (Op.getValueType().isVector()) 4210 return LowerVectorINT_TO_FP(Op, DAG); 4211 4212 bool IsStrict = Op->isStrictFPOpcode(); 4213 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 4214 4215 // f16 conversions are promoted to f32 when full fp16 is not supported. 
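// (Illustrative: sitofp i32 -> half without +fullfp16 becomes an i32 -> float
// conversion followed by a round back down to half.)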
4216 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { 4217 SDLoc dl(Op); 4218 if (IsStrict) { 4219 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other}, 4220 {Op.getOperand(0), SrcVal}); 4221 return DAG.getNode( 4222 ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other}, 4223 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)}); 4224 } 4225 return DAG.getNode( 4226 ISD::FP_ROUND, dl, MVT::f16, 4227 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), 4228 DAG.getIntPtrConstant(0, dl)); 4229 } 4230 4231 // i128 conversions are libcalls. 4232 if (SrcVal.getValueType() == MVT::i128) 4233 return SDValue(); 4234 4235 // Other conversions are legal, unless it's to the completely software-based 4236 // fp128. 4237 if (Op.getValueType() != MVT::f128) 4238 return Op; 4239 return SDValue(); 4240 } 4241 4242 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 4243 SelectionDAG &DAG) const { 4244 // For iOS, we want to call an alternative entry point: __sincos_stret, 4245 // which returns the values in two S / D registers. 4246 SDLoc dl(Op); 4247 SDValue Arg = Op.getOperand(0); 4248 EVT ArgVT = Arg.getValueType(); 4249 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 4250 4251 ArgListTy Args; 4252 ArgListEntry Entry; 4253 4254 Entry.Node = Arg; 4255 Entry.Ty = ArgTy; 4256 Entry.IsSExt = false; 4257 Entry.IsZExt = false; 4258 Args.push_back(Entry); 4259 4260 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 4261 : RTLIB::SINCOS_STRET_F32; 4262 const char *LibcallName = getLibcallName(LC); 4263 SDValue Callee = 4264 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 4265 4266 StructType *RetTy = StructType::get(ArgTy, ArgTy); 4267 TargetLowering::CallLoweringInfo CLI(DAG); 4268 CLI.setDebugLoc(dl) 4269 .setChain(DAG.getEntryNode()) 4270 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 4271 4272 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 4273 return CallResult.first; 4274 } 4275 4276 static MVT getSVEContainerType(EVT ContentTy); 4277 4278 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, 4279 SelectionDAG &DAG) const { 4280 EVT OpVT = Op.getValueType(); 4281 EVT ArgVT = Op.getOperand(0).getValueType(); 4282 4283 if (useSVEForFixedLengthVectorVT(OpVT)) 4284 return LowerFixedLengthBitcastToSVE(Op, DAG); 4285 4286 if (OpVT.isScalableVector()) { 4287 // Bitcasting between unpacked vector types of different element counts is 4288 // not a NOP because the live elements are laid out differently. 4289 // 01234567 4290 // e.g. nxv2i32 = XX??XX?? 4291 // nxv4f16 = X?X?X?X? 4292 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) 4293 return SDValue(); 4294 4295 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { 4296 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && 4297 "Expected int->fp bitcast!"); 4298 SDValue ExtResult = 4299 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), 4300 Op.getOperand(0)); 4301 return getSVESafeBitCast(OpVT, ExtResult, DAG); 4302 } 4303 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); 4304 } 4305 4306 if (OpVT != MVT::f16 && OpVT != MVT::bf16) 4307 return SDValue(); 4308 4309 // Bitcasts between f16 and bf16 are legal. 
4310 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16) 4311 return Op; 4312 4313 assert(ArgVT == MVT::i16); 4314 SDLoc DL(Op); 4315 4316 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 4317 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 4318 return SDValue( 4319 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, 4320 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 4321 0); 4322 } 4323 4324 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 4325 if (OrigVT.getSizeInBits() >= 64) 4326 return OrigVT; 4327 4328 assert(OrigVT.isSimple() && "Expecting a simple value type"); 4329 4330 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 4331 switch (OrigSimpleTy) { 4332 default: llvm_unreachable("Unexpected Vector Type"); 4333 case MVT::v2i8: 4334 case MVT::v2i16: 4335 return MVT::v2i32; 4336 case MVT::v4i8: 4337 return MVT::v4i16; 4338 } 4339 } 4340 4341 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 4342 const EVT &OrigTy, 4343 const EVT &ExtTy, 4344 unsigned ExtOpcode) { 4345 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 4346 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 4347 // 64-bits we need to insert a new extension so that it will be 64-bits. 4348 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 4349 if (OrigTy.getSizeInBits() >= 64) 4350 return N; 4351 4352 // Must extend size to at least 64 bits to be used as an operand for VMULL. 4353 EVT NewVT = getExtensionTo64Bits(OrigTy); 4354 4355 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 4356 } 4357 4358 // Returns lane if Op extracts from a two-element vector and lane is constant 4359 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise. 
4360 static std::optional<uint64_t> 4361 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) { 4362 SDNode *OpNode = Op.getNode(); 4363 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 4364 return std::nullopt; 4365 4366 EVT VT = OpNode->getOperand(0).getValueType(); 4367 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1)); 4368 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C) 4369 return std::nullopt; 4370 4371 return C->getZExtValue(); 4372 } 4373 4374 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 4375 bool isSigned) { 4376 EVT VT = N->getValueType(0); 4377 4378 if (N->getOpcode() != ISD::BUILD_VECTOR) 4379 return false; 4380 4381 for (const SDValue &Elt : N->op_values()) { 4382 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 4383 unsigned EltSize = VT.getScalarSizeInBits(); 4384 unsigned HalfSize = EltSize / 2; 4385 if (isSigned) { 4386 if (!isIntN(HalfSize, C->getSExtValue())) 4387 return false; 4388 } else { 4389 if (!isUIntN(HalfSize, C->getZExtValue())) 4390 return false; 4391 } 4392 continue; 4393 } 4394 return false; 4395 } 4396 4397 return true; 4398 } 4399 4400 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 4401 if (N->getOpcode() == ISD::SIGN_EXTEND || 4402 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) 4403 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 4404 N->getOperand(0)->getValueType(0), 4405 N->getValueType(0), 4406 N->getOpcode()); 4407 4408 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 4409 EVT VT = N->getValueType(0); 4410 SDLoc dl(N); 4411 unsigned EltSize = VT.getScalarSizeInBits() / 2; 4412 unsigned NumElts = VT.getVectorNumElements(); 4413 MVT TruncVT = MVT::getIntegerVT(EltSize); 4414 SmallVector<SDValue, 8> Ops; 4415 for (unsigned i = 0; i != NumElts; ++i) { 4416 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 4417 const APInt &CInt = C->getAPIntValue(); 4418 // Element types smaller than 32 bits are not legal, so use i32 elements. 4419 // The values are implicitly truncated so sext vs. zext doesn't matter. 
4420 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 4421 } 4422 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 4423 } 4424 4425 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 4426 return N->getOpcode() == ISD::SIGN_EXTEND || 4427 N->getOpcode() == ISD::ANY_EXTEND || 4428 isExtendedBUILD_VECTOR(N, DAG, true); 4429 } 4430 4431 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 4432 return N->getOpcode() == ISD::ZERO_EXTEND || 4433 N->getOpcode() == ISD::ANY_EXTEND || 4434 isExtendedBUILD_VECTOR(N, DAG, false); 4435 } 4436 4437 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 4438 unsigned Opcode = N->getOpcode(); 4439 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 4440 SDNode *N0 = N->getOperand(0).getNode(); 4441 SDNode *N1 = N->getOperand(1).getNode(); 4442 return N0->hasOneUse() && N1->hasOneUse() && 4443 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 4444 } 4445 return false; 4446 } 4447 4448 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 4449 unsigned Opcode = N->getOpcode(); 4450 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 4451 SDNode *N0 = N->getOperand(0).getNode(); 4452 SDNode *N1 = N->getOperand(1).getNode(); 4453 return N0->hasOneUse() && N1->hasOneUse() && 4454 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 4455 } 4456 return false; 4457 } 4458 4459 SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, 4460 SelectionDAG &DAG) const { 4461 // The rounding mode is in bits 23:22 of the FPSCR. 4462 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 4463 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 4464 // so that the shift + and get folded into a bitfield extract. 4465 SDLoc dl(Op); 4466 4467 SDValue Chain = Op.getOperand(0); 4468 SDValue FPCR_64 = DAG.getNode( 4469 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, 4470 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); 4471 Chain = FPCR_64.getValue(1); 4472 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); 4473 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, 4474 DAG.getConstant(1U << 22, dl, MVT::i32)); 4475 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 4476 DAG.getConstant(22, dl, MVT::i32)); 4477 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 4478 DAG.getConstant(3, dl, MVT::i32)); 4479 return DAG.getMergeValues({AND, Chain}, dl); 4480 } 4481 4482 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, 4483 SelectionDAG &DAG) const { 4484 SDLoc DL(Op); 4485 SDValue Chain = Op->getOperand(0); 4486 SDValue RMValue = Op->getOperand(1); 4487 4488 // The rounding mode is in bits 23:22 of the FPCR. 4489 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping 4490 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is 4491 // ((arg - 1) & 3) << 22). 4492 // 4493 // The argument of llvm.set.rounding must be within the segment [0, 3], so 4494 // NearestTiesToAway (4) is not handled here. It is responsibility of the code 4495 // generated llvm.set.rounding to ensure this condition. 4496 4497 // Calculate new value of FPCR[23:22]. 
4498 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, 4499 DAG.getConstant(1, DL, MVT::i32)); 4500 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, 4501 DAG.getConstant(0x3, DL, MVT::i32)); 4502 RMValue = 4503 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, 4504 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32)); 4505 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue); 4506 4507 // Get current value of FPCR. 4508 SDValue Ops[] = { 4509 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}; 4510 SDValue FPCR = 4511 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops); 4512 Chain = FPCR.getValue(1); 4513 FPCR = FPCR.getValue(0); 4514 4515 // Put new rounding mode into FPSCR[23:22]. 4516 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); 4517 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR, 4518 DAG.getConstant(RMMask, DL, MVT::i64)); 4519 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue); 4520 SDValue Ops2[] = { 4521 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), 4522 FPCR}; 4523 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 4524 } 4525 4526 static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG, 4527 SDLoc DL, bool &IsMLA) { 4528 bool IsN0SExt = isSignExtended(N0, DAG); 4529 bool IsN1SExt = isSignExtended(N1, DAG); 4530 if (IsN0SExt && IsN1SExt) 4531 return AArch64ISD::SMULL; 4532 4533 bool IsN0ZExt = isZeroExtended(N0, DAG); 4534 bool IsN1ZExt = isZeroExtended(N1, DAG); 4535 4536 if (IsN0ZExt && IsN1ZExt) 4537 return AArch64ISD::UMULL; 4538 4539 // Select SMULL if we can replace zext with sext. 4540 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) && 4541 !isExtendedBUILD_VECTOR(N0, DAG, false) && 4542 !isExtendedBUILD_VECTOR(N1, DAG, false)) { 4543 SDValue ZextOperand; 4544 if (IsN0ZExt) 4545 ZextOperand = N0->getOperand(0); 4546 else 4547 ZextOperand = N1->getOperand(0); 4548 if (DAG.SignBitIsZero(ZextOperand)) { 4549 SDNode *NewSext = 4550 DAG.getSExtOrTrunc(ZextOperand, DL, N0->getValueType(0)).getNode(); 4551 if (IsN0ZExt) 4552 N0 = NewSext; 4553 else 4554 N1 = NewSext; 4555 return AArch64ISD::SMULL; 4556 } 4557 } 4558 4559 // Select UMULL if we can replace the other operand with an extend. 4560 if (IsN0ZExt || IsN1ZExt) { 4561 EVT VT = N0->getValueType(0); 4562 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(), 4563 VT.getScalarSizeInBits() / 2); 4564 if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) { 4565 EVT HalfVT; 4566 switch (VT.getSimpleVT().SimpleTy) { 4567 case MVT::v2i64: 4568 HalfVT = MVT::v2i32; 4569 break; 4570 case MVT::v4i32: 4571 HalfVT = MVT::v4i16; 4572 break; 4573 case MVT::v8i16: 4574 HalfVT = MVT::v8i8; 4575 break; 4576 default: 4577 return 0; 4578 } 4579 // Truncate and then extend the result. 4580 SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, 4581 SDValue(IsN0ZExt ? N1 : N0, 0)); 4582 NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT); 4583 if (IsN0ZExt) 4584 N1 = NewExt.getNode(); 4585 else 4586 N0 = NewExt.getNode(); 4587 return AArch64ISD::UMULL; 4588 } 4589 } 4590 4591 if (!IsN1SExt && !IsN1ZExt) 4592 return 0; 4593 4594 // Look for (s/zext A + s/zext B) * (s/zext C). 
We want to turn these 4595 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 4596 if (IsN1SExt && isAddSubSExt(N0, DAG)) { 4597 IsMLA = true; 4598 return AArch64ISD::SMULL; 4599 } 4600 if (IsN1ZExt && isAddSubZExt(N0, DAG)) { 4601 IsMLA = true; 4602 return AArch64ISD::UMULL; 4603 } 4604 if (IsN0ZExt && isAddSubZExt(N1, DAG)) { 4605 std::swap(N0, N1); 4606 IsMLA = true; 4607 return AArch64ISD::UMULL; 4608 } 4609 return 0; 4610 } 4611 4612 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 4613 EVT VT = Op.getValueType(); 4614 4615 // If SVE is available then i64 vector multiplications can also be made legal. 4616 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 || 4617 Subtarget->forceStreamingCompatibleSVE(); 4618 4619 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) 4620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); 4621 4622 // Multiplications are only custom-lowered for 128-bit vectors so that 4623 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 4624 assert(VT.is128BitVector() && VT.isInteger() && 4625 "unexpected type for custom-lowering ISD::MUL"); 4626 SDNode *N0 = Op.getOperand(0).getNode(); 4627 SDNode *N1 = Op.getOperand(1).getNode(); 4628 bool isMLA = false; 4629 SDLoc DL(Op); 4630 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA); 4631 4632 if (!NewOpc) { 4633 if (VT == MVT::v2i64) 4634 // Fall through to expand this. It is not legal. 4635 return SDValue(); 4636 else 4637 // Other vector multiplications are legal. 4638 return Op; 4639 } 4640 4641 // Legalize to a S/UMULL instruction 4642 SDValue Op0; 4643 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 4644 if (!isMLA) { 4645 Op0 = skipExtensionForVectorMULL(N0, DAG); 4646 assert(Op0.getValueType().is64BitVector() && 4647 Op1.getValueType().is64BitVector() && 4648 "unexpected types for extended operands to VMULL"); 4649 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 4650 } 4651 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 4652 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 4653 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 4654 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 4655 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 4656 EVT Op1VT = Op1.getValueType(); 4657 return DAG.getNode(N0->getOpcode(), DL, VT, 4658 DAG.getNode(NewOpc, DL, VT, 4659 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 4660 DAG.getNode(NewOpc, DL, VT, 4661 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 4662 } 4663 4664 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, 4665 int Pattern) { 4666 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) 4667 return DAG.getConstant(1, DL, MVT::nxv1i1); 4668 return DAG.getNode(AArch64ISD::PTRUE, DL, VT, 4669 DAG.getTargetConstant(Pattern, DL, MVT::i32)); 4670 } 4671 4672 static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, 4673 bool IsLess, bool IsEqual) { 4674 if (!isa<ConstantSDNode>(Op.getOperand(1)) || 4675 !isa<ConstantSDNode>(Op.getOperand(2))) 4676 return SDValue(); 4677 4678 SDLoc dl(Op); 4679 APInt X = Op.getConstantOperandAPInt(1); 4680 APInt Y = Op.getConstantOperandAPInt(2); 4681 APInt NumActiveElems; 4682 bool Overflow; 4683 if (IsLess) 4684 NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow); 4685 else 4686 NumActiveElems = IsSigned ? 
X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow); 4687 4688 if (Overflow) 4689 return SDValue(); 4690 4691 if (IsEqual) { 4692 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned); 4693 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow) 4694 : NumActiveElems.uadd_ov(One, Overflow); 4695 if (Overflow) 4696 return SDValue(); 4697 } 4698 4699 std::optional<unsigned> PredPattern = 4700 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue()); 4701 unsigned MinSVEVectorSize = std::max( 4702 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u); 4703 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements(); 4704 if (PredPattern != std::nullopt && 4705 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize)) 4706 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern); 4707 4708 return SDValue(); 4709 } 4710 4711 // Returns a safe bitcast between two scalable vector predicates, where 4712 // any newly created lanes from a widening bitcast are defined as zero. 4713 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { 4714 SDLoc DL(Op); 4715 EVT InVT = Op.getValueType(); 4716 4717 assert(InVT.getVectorElementType() == MVT::i1 && 4718 VT.getVectorElementType() == MVT::i1 && 4719 "Expected a predicate-to-predicate bitcast"); 4720 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 4721 InVT.isScalableVector() && 4722 DAG.getTargetLoweringInfo().isTypeLegal(InVT) && 4723 "Only expect to cast between legal scalable predicate types!"); 4724 4725 // Return the operand if the cast isn't changing type, 4726 // e.g. <n x 16 x i1> -> <n x 16 x i1> 4727 if (InVT == VT) 4728 return Op; 4729 4730 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); 4731 4732 // We only have to zero the lanes if new lanes are being defined, e.g. when 4733 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the 4734 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then 4735 // we can return here. 4736 if (InVT.bitsGT(VT)) 4737 return Reinterpret; 4738 4739 // Check if the other lanes are already known to be zeroed by 4740 // construction. 4741 if (isZeroingInactiveLanes(Op)) 4742 return Reinterpret; 4743 4744 // Zero the newly introduced lanes. 
4745 SDValue Mask = DAG.getConstant(1, DL, InVT); 4746 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask); 4747 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); 4748 } 4749 4750 SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, 4751 SMEAttrs Attrs, SDLoc DL, 4752 EVT VT) const { 4753 if (Attrs.hasStreamingInterfaceOrBody()) 4754 return DAG.getConstant(1, DL, VT); 4755 4756 if (Attrs.hasNonStreamingInterfaceAndBody()) 4757 return DAG.getConstant(0, DL, VT); 4758 4759 assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface"); 4760 4761 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", 4762 getPointerTy(DAG.getDataLayout())); 4763 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); 4764 Type *RetTy = StructType::get(Int64Ty, Int64Ty); 4765 TargetLowering::CallLoweringInfo CLI(DAG); 4766 ArgListTy Args; 4767 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( 4768 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, 4769 RetTy, Callee, std::move(Args)); 4770 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 4771 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); 4772 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), 4773 Mask); 4774 } 4775 4776 static std::optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) { 4777 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) { 4778 StringRef S(ES->getSymbol()); 4779 if (S == "__arm_sme_state" || S == "__arm_tpidr2_save") 4780 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved); 4781 if (S == "__arm_tpidr2_restore") 4782 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared); 4783 } 4784 return std::nullopt; 4785 } 4786 4787 SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, 4788 SelectionDAG &DAG) const { 4789 unsigned IntNo = Op.getConstantOperandVal(1); 4790 SDLoc DL(Op); 4791 switch (IntNo) { 4792 default: 4793 return SDValue(); // Don't custom lower most intrinsics. 
4794 case Intrinsic::aarch64_prefetch: { 4795 SDValue Chain = Op.getOperand(0); 4796 SDValue Addr = Op.getOperand(2); 4797 4798 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 4799 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 4800 unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 4801 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); 4802 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 4803 (!IsData << 3) | // IsDataCache bit 4804 (Locality << 1) | // Cache level bits 4805 (unsigned)IsStream; // Stream bit 4806 4807 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, 4808 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); 4809 } 4810 case Intrinsic::aarch64_sme_za_enable: 4811 return DAG.getNode( 4812 AArch64ISD::SMSTART, DL, MVT::Other, 4813 Op->getOperand(0), // Chain 4814 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), 4815 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); 4816 case Intrinsic::aarch64_sme_za_disable: 4817 return DAG.getNode( 4818 AArch64ISD::SMSTOP, DL, MVT::Other, 4819 Op->getOperand(0), // Chain 4820 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), 4821 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); 4822 } 4823 } 4824 4825 SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 4826 SelectionDAG &DAG) const { 4827 unsigned IntNo = Op.getConstantOperandVal(1); 4828 SDLoc DL(Op); 4829 switch (IntNo) { 4830 default: 4831 return SDValue(); // Don't custom lower most intrinsics. 4832 case Intrinsic::aarch64_mops_memset_tag: { 4833 auto Node = cast<MemIntrinsicSDNode>(Op.getNode()); 4834 SDValue Chain = Node->getChain(); 4835 SDValue Dst = Op.getOperand(2); 4836 SDValue Val = Op.getOperand(3); 4837 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64); 4838 SDValue Size = Op.getOperand(4); 4839 auto Alignment = Node->getMemOperand()->getAlign(); 4840 bool IsVol = Node->isVolatile(); 4841 auto DstPtrInfo = Node->getPointerInfo(); 4842 4843 const auto &SDI = 4844 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo()); 4845 SDValue MS = 4846 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val, 4847 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{}); 4848 4849 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the 4850 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise 4851 // LowerOperationWrapper will complain that the number of results has 4852 // changed. 4853 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); 4854 } 4855 } 4856 } 4857 4858 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 4859 SelectionDAG &DAG) const { 4860 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4861 SDLoc dl(Op); 4862 switch (IntNo) { 4863 default: return SDValue(); // Don't custom lower most intrinsics. 
4864 case Intrinsic::thread_pointer: { 4865 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4866 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 4867 } 4868 case Intrinsic::aarch64_neon_abs: { 4869 EVT Ty = Op.getValueType(); 4870 if (Ty == MVT::i64) { 4871 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, 4872 Op.getOperand(1)); 4873 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); 4874 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); 4875 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { 4876 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); 4877 } else { 4878 report_fatal_error("Unexpected type for AArch64 NEON intrinic"); 4879 } 4880 } 4881 case Intrinsic::aarch64_neon_pmull64: { 4882 SDValue LHS = Op.getOperand(1); 4883 SDValue RHS = Op.getOperand(2); 4884 4885 std::optional<uint64_t> LHSLane = 4886 getConstantLaneNumOfExtractHalfOperand(LHS); 4887 std::optional<uint64_t> RHSLane = 4888 getConstantLaneNumOfExtractHalfOperand(RHS); 4889 4890 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1"); 4891 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1"); 4892 4893 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2 4894 // instructions execute on SIMD registers. So canonicalize i64 to v1i64, 4895 // which ISel recognizes better. For example, generate a ldr into d* 4896 // registers as opposed to a GPR load followed by a fmov. 4897 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane, 4898 std::optional<uint64_t> OtherLane, 4899 const SDLoc &dl, 4900 SelectionDAG &DAG) -> SDValue { 4901 // If the operand is an higher half itself, rewrite it to 4902 // extract_high_v2i64; this way aarch64_neon_pmull64 could 4903 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}. 4904 if (NLane && *NLane == 1) 4905 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, 4906 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64)); 4907 4908 // Operand N is not a higher half but the other operand is. 4909 if (OtherLane && *OtherLane == 1) { 4910 // If this operand is a lower half, rewrite it to 4911 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to 4912 // align lanes of two operands. A roundtrip sequence (to move from lane 4913 // 1 to lane 0) is like this: 4914 // mov x8, v0.d[1] 4915 // fmov d0, x8 4916 if (NLane && *NLane == 0) 4917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, 4918 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64, 4919 N.getOperand(0), 4920 DAG.getConstant(0, dl, MVT::i64)), 4921 DAG.getConstant(1, dl, MVT::i64)); 4922 4923 // Otherwise just dup from main to all lanes. 4924 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N); 4925 } 4926 4927 // Neither operand is an extract of higher half, so codegen may just use 4928 // the non-high version of PMULL instruction. Use v1i64 to represent i64. 
4929 assert(N.getValueType() == MVT::i64 && 4930 "Intrinsic aarch64_neon_pmull64 requires i64 parameters"); 4931 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N); 4932 }; 4933 4934 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG); 4935 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG); 4936 4937 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS); 4938 } 4939 case Intrinsic::aarch64_neon_smax: 4940 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 4941 Op.getOperand(1), Op.getOperand(2)); 4942 case Intrinsic::aarch64_neon_umax: 4943 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 4944 Op.getOperand(1), Op.getOperand(2)); 4945 case Intrinsic::aarch64_neon_smin: 4946 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 4947 Op.getOperand(1), Op.getOperand(2)); 4948 case Intrinsic::aarch64_neon_umin: 4949 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 4950 Op.getOperand(1), Op.getOperand(2)); 4951 case Intrinsic::aarch64_neon_scalar_sqxtn: 4952 case Intrinsic::aarch64_neon_scalar_sqxtun: 4953 case Intrinsic::aarch64_neon_scalar_uqxtn: { 4954 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32); 4955 if (Op.getValueType() == MVT::i32) 4956 return DAG.getNode(ISD::BITCAST, dl, MVT::i32, 4957 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32, 4958 Op.getOperand(0), 4959 DAG.getNode(ISD::BITCAST, dl, MVT::f64, 4960 Op.getOperand(1)))); 4961 return SDValue(); 4962 } 4963 case Intrinsic::aarch64_sve_whilelo: 4964 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, 4965 /*IsEqual=*/false); 4966 case Intrinsic::aarch64_sve_whilelt: 4967 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, 4968 /*IsEqual=*/false); 4969 case Intrinsic::aarch64_sve_whilels: 4970 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, 4971 /*IsEqual=*/true); 4972 case Intrinsic::aarch64_sve_whilele: 4973 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, 4974 /*IsEqual=*/true); 4975 case Intrinsic::aarch64_sve_whilege: 4976 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, 4977 /*IsEqual=*/true); 4978 case Intrinsic::aarch64_sve_whilegt: 4979 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, 4980 /*IsEqual=*/false); 4981 case Intrinsic::aarch64_sve_whilehs: 4982 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, 4983 /*IsEqual=*/true); 4984 case Intrinsic::aarch64_sve_whilehi: 4985 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, 4986 /*IsEqual=*/false); 4987 case Intrinsic::aarch64_sve_sunpkhi: 4988 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), 4989 Op.getOperand(1)); 4990 case Intrinsic::aarch64_sve_sunpklo: 4991 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), 4992 Op.getOperand(1)); 4993 case Intrinsic::aarch64_sve_uunpkhi: 4994 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), 4995 Op.getOperand(1)); 4996 case Intrinsic::aarch64_sve_uunpklo: 4997 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), 4998 Op.getOperand(1)); 4999 case Intrinsic::aarch64_sve_clasta_n: 5000 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), 5001 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 5002 case Intrinsic::aarch64_sve_clastb_n: 5003 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), 5004 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 5005 case Intrinsic::aarch64_sve_lasta: 5006 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), 
5007 Op.getOperand(1), Op.getOperand(2)); 5008 case Intrinsic::aarch64_sve_lastb: 5009 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), 5010 Op.getOperand(1), Op.getOperand(2)); 5011 case Intrinsic::aarch64_sve_rev: 5012 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(), 5013 Op.getOperand(1)); 5014 case Intrinsic::aarch64_sve_tbl: 5015 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), 5016 Op.getOperand(1), Op.getOperand(2)); 5017 case Intrinsic::aarch64_sve_trn1: 5018 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), 5019 Op.getOperand(1), Op.getOperand(2)); 5020 case Intrinsic::aarch64_sve_trn2: 5021 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), 5022 Op.getOperand(1), Op.getOperand(2)); 5023 case Intrinsic::aarch64_sve_uzp1: 5024 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), 5025 Op.getOperand(1), Op.getOperand(2)); 5026 case Intrinsic::aarch64_sve_uzp2: 5027 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), 5028 Op.getOperand(1), Op.getOperand(2)); 5029 case Intrinsic::aarch64_sve_zip1: 5030 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), 5031 Op.getOperand(1), Op.getOperand(2)); 5032 case Intrinsic::aarch64_sve_zip2: 5033 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), 5034 Op.getOperand(1), Op.getOperand(2)); 5035 case Intrinsic::aarch64_sve_splice: 5036 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), 5037 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 5038 case Intrinsic::aarch64_sve_ptrue: 5039 return getPTrue(DAG, dl, Op.getValueType(), 5040 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 5041 case Intrinsic::aarch64_sve_clz: 5042 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), 5043 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5044 case Intrinsic::aarch64_sme_cntsb: 5045 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), 5046 DAG.getConstant(1, dl, MVT::i32)); 5047 case Intrinsic::aarch64_sme_cntsh: { 5048 SDValue One = DAG.getConstant(1, dl, MVT::i32); 5049 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One); 5050 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One); 5051 } 5052 case Intrinsic::aarch64_sme_cntsw: { 5053 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), 5054 DAG.getConstant(1, dl, MVT::i32)); 5055 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, 5056 DAG.getConstant(2, dl, MVT::i32)); 5057 } 5058 case Intrinsic::aarch64_sme_cntsd: { 5059 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), 5060 DAG.getConstant(1, dl, MVT::i32)); 5061 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, 5062 DAG.getConstant(3, dl, MVT::i32)); 5063 } 5064 case Intrinsic::aarch64_sve_cnt: { 5065 SDValue Data = Op.getOperand(3); 5066 // CTPOP only supports integer operands. 
5067 if (Data.getValueType().isFloatingPoint()) 5068 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); 5069 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), 5070 Op.getOperand(2), Data, Op.getOperand(1)); 5071 } 5072 case Intrinsic::aarch64_sve_dupq_lane: 5073 return LowerDUPQLane(Op, DAG); 5074 case Intrinsic::aarch64_sve_convert_from_svbool: 5075 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); 5076 case Intrinsic::aarch64_sve_convert_to_svbool: 5077 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); 5078 case Intrinsic::aarch64_sve_fneg: 5079 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), 5080 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5081 case Intrinsic::aarch64_sve_frintp: 5082 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), 5083 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5084 case Intrinsic::aarch64_sve_frintm: 5085 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), 5086 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5087 case Intrinsic::aarch64_sve_frinti: 5088 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), 5089 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5090 case Intrinsic::aarch64_sve_frintx: 5091 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), 5092 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5093 case Intrinsic::aarch64_sve_frinta: 5094 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), 5095 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5096 case Intrinsic::aarch64_sve_frintn: 5097 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), 5098 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5099 case Intrinsic::aarch64_sve_frintz: 5100 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), 5101 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5102 case Intrinsic::aarch64_sve_ucvtf: 5103 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, 5104 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 5105 Op.getOperand(1)); 5106 case Intrinsic::aarch64_sve_scvtf: 5107 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, 5108 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 5109 Op.getOperand(1)); 5110 case Intrinsic::aarch64_sve_fcvtzu: 5111 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, 5112 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 5113 Op.getOperand(1)); 5114 case Intrinsic::aarch64_sve_fcvtzs: 5115 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, 5116 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 5117 Op.getOperand(1)); 5118 case Intrinsic::aarch64_sve_fsqrt: 5119 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), 5120 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5121 case Intrinsic::aarch64_sve_frecpx: 5122 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), 5123 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5124 case Intrinsic::aarch64_sve_frecpe_x: 5125 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(), 5126 Op.getOperand(1)); 5127 case Intrinsic::aarch64_sve_frecps_x: 5128 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(), 5129 Op.getOperand(1), Op.getOperand(2)); 5130 case Intrinsic::aarch64_sve_frsqrte_x: 5131 return DAG.getNode(AArch64ISD::FRSQRTE, 
dl, Op.getValueType(), 5132 Op.getOperand(1)); 5133 case Intrinsic::aarch64_sve_frsqrts_x: 5134 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(), 5135 Op.getOperand(1), Op.getOperand(2)); 5136 case Intrinsic::aarch64_sve_fabs: 5137 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), 5138 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5139 case Intrinsic::aarch64_sve_abs: 5140 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), 5141 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5142 case Intrinsic::aarch64_sve_neg: 5143 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), 5144 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5145 case Intrinsic::aarch64_sve_insr: { 5146 SDValue Scalar = Op.getOperand(2); 5147 EVT ScalarTy = Scalar.getValueType(); 5148 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 5149 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 5150 5151 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), 5152 Op.getOperand(1), Scalar); 5153 } 5154 case Intrinsic::aarch64_sve_rbit: 5155 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, 5156 Op.getValueType(), Op.getOperand(2), Op.getOperand(3), 5157 Op.getOperand(1)); 5158 case Intrinsic::aarch64_sve_revb: 5159 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), 5160 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5161 case Intrinsic::aarch64_sve_revh: 5162 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(), 5163 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5164 case Intrinsic::aarch64_sve_revw: 5165 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), 5166 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5167 case Intrinsic::aarch64_sve_revd: 5168 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(), 5169 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); 5170 case Intrinsic::aarch64_sve_sxtb: 5171 return DAG.getNode( 5172 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5173 Op.getOperand(2), Op.getOperand(3), 5174 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), 5175 Op.getOperand(1)); 5176 case Intrinsic::aarch64_sve_sxth: 5177 return DAG.getNode( 5178 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5179 Op.getOperand(2), Op.getOperand(3), 5180 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), 5181 Op.getOperand(1)); 5182 case Intrinsic::aarch64_sve_sxtw: 5183 return DAG.getNode( 5184 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5185 Op.getOperand(2), Op.getOperand(3), 5186 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), 5187 Op.getOperand(1)); 5188 case Intrinsic::aarch64_sve_uxtb: 5189 return DAG.getNode( 5190 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5191 Op.getOperand(2), Op.getOperand(3), 5192 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), 5193 Op.getOperand(1)); 5194 case Intrinsic::aarch64_sve_uxth: 5195 return DAG.getNode( 5196 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5197 Op.getOperand(2), Op.getOperand(3), 5198 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), 5199 Op.getOperand(1)); 5200 case Intrinsic::aarch64_sve_uxtw: 5201 return DAG.getNode( 5202 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), 5203 Op.getOperand(2), 
Op.getOperand(3), 5204 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), 5205 Op.getOperand(1)); 5206 case Intrinsic::localaddress: { 5207 const auto &MF = DAG.getMachineFunction(); 5208 const auto *RegInfo = Subtarget->getRegisterInfo(); 5209 unsigned Reg = RegInfo->getLocalAddressRegister(MF); 5210 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, 5211 Op.getSimpleValueType()); 5212 } 5213 5214 case Intrinsic::eh_recoverfp: { 5215 // FIXME: This needs to be implemented to correctly handle highly aligned 5216 // stack objects. For now we simply return the incoming FP. Refer D53541 5217 // for more details. 5218 SDValue FnOp = Op.getOperand(1); 5219 SDValue IncomingFPOp = Op.getOperand(2); 5220 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 5221 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); 5222 if (!Fn) 5223 report_fatal_error( 5224 "llvm.eh.recoverfp must take a function as the first argument"); 5225 return IncomingFPOp; 5226 } 5227 5228 case Intrinsic::aarch64_neon_vsri: 5229 case Intrinsic::aarch64_neon_vsli: { 5230 EVT Ty = Op.getValueType(); 5231 5232 if (!Ty.isVector()) 5233 report_fatal_error("Unexpected type for aarch64_neon_vsli"); 5234 5235 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); 5236 5237 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; 5238 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; 5239 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), 5240 Op.getOperand(3)); 5241 } 5242 5243 case Intrinsic::aarch64_neon_srhadd: 5244 case Intrinsic::aarch64_neon_urhadd: 5245 case Intrinsic::aarch64_neon_shadd: 5246 case Intrinsic::aarch64_neon_uhadd: { 5247 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || 5248 IntNo == Intrinsic::aarch64_neon_shadd); 5249 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || 5250 IntNo == Intrinsic::aarch64_neon_urhadd); 5251 unsigned Opcode = IsSignedAdd 5252 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) 5253 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); 5254 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 5255 Op.getOperand(2)); 5256 } 5257 case Intrinsic::aarch64_neon_sabd: 5258 case Intrinsic::aarch64_neon_uabd: { 5259 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU 5260 : ISD::ABDS; 5261 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 5262 Op.getOperand(2)); 5263 } 5264 case Intrinsic::aarch64_neon_saddlp: 5265 case Intrinsic::aarch64_neon_uaddlp: { 5266 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp 5267 ? AArch64ISD::UADDLP 5268 : AArch64ISD::SADDLP; 5269 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); 5270 } 5271 case Intrinsic::aarch64_neon_sdot: 5272 case Intrinsic::aarch64_neon_udot: 5273 case Intrinsic::aarch64_sve_sdot: 5274 case Intrinsic::aarch64_sve_udot: { 5275 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || 5276 IntNo == Intrinsic::aarch64_sve_udot) 5277 ? 
AArch64ISD::UDOT 5278 : AArch64ISD::SDOT; 5279 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), 5280 Op.getOperand(2), Op.getOperand(3)); 5281 } 5282 case Intrinsic::get_active_lane_mask: { 5283 SDValue ID = 5284 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); 5285 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, 5286 Op.getOperand(1), Op.getOperand(2)); 5287 } 5288 } 5289 } 5290 5291 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { 5292 if (VT.getVectorElementType() == MVT::i8 || 5293 VT.getVectorElementType() == MVT::i16) { 5294 EltTy = MVT::i32; 5295 return true; 5296 } 5297 return false; 5298 } 5299 5300 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT, 5301 EVT DataVT) const { 5302 // SVE only supports implicit extension of 32-bit indices. 5303 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) 5304 return false; 5305 5306 // Indices cannot be smaller than the main data type. 5307 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) 5308 return false; 5309 5310 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit 5311 // element container type, which would violate the previous clause. 5312 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; 5313 } 5314 5315 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 5316 return ExtVal.getValueType().isScalableVector() || 5317 useSVEForFixedLengthVectorVT( 5318 ExtVal.getValueType(), 5319 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()); 5320 } 5321 5322 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { 5323 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { 5324 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), 5325 AArch64ISD::GLD1_MERGE_ZERO}, 5326 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), 5327 AArch64ISD::GLD1_UXTW_MERGE_ZERO}, 5328 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), 5329 AArch64ISD::GLD1_MERGE_ZERO}, 5330 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), 5331 AArch64ISD::GLD1_SXTW_MERGE_ZERO}, 5332 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), 5333 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 5334 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), 5335 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, 5336 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), 5337 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 5338 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), 5339 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, 5340 }; 5341 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); 5342 return AddrModes.find(Key)->second; 5343 } 5344 5345 unsigned getSignExtendedGatherOpcode(unsigned Opcode) { 5346 switch (Opcode) { 5347 default: 5348 llvm_unreachable("unimplemented opcode"); 5349 return Opcode; 5350 case AArch64ISD::GLD1_MERGE_ZERO: 5351 return AArch64ISD::GLD1S_MERGE_ZERO; 5352 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 5353 return AArch64ISD::GLD1S_IMM_MERGE_ZERO; 5354 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 5355 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 5356 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 5357 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 5358 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 5359 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 5360 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 5361 return 
AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 5362 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 5363 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 5364 } 5365 } 5366 5367 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, 5368 SelectionDAG &DAG) const { 5369 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); 5370 5371 SDLoc DL(Op); 5372 SDValue Chain = MGT->getChain(); 5373 SDValue PassThru = MGT->getPassThru(); 5374 SDValue Mask = MGT->getMask(); 5375 SDValue BasePtr = MGT->getBasePtr(); 5376 SDValue Index = MGT->getIndex(); 5377 SDValue Scale = MGT->getScale(); 5378 EVT VT = Op.getValueType(); 5379 EVT MemVT = MGT->getMemoryVT(); 5380 ISD::LoadExtType ExtType = MGT->getExtensionType(); 5381 ISD::MemIndexType IndexType = MGT->getIndexType(); 5382 5383 // SVE supports zero (and so undef) passthrough values only, everything else 5384 // must be handled manually by an explicit select on the load's output. 5385 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) { 5386 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale}; 5387 SDValue Load = 5388 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, 5389 MGT->getMemOperand(), IndexType, ExtType); 5390 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru); 5391 return DAG.getMergeValues({Select, Load.getValue(1)}, DL); 5392 } 5393 5394 bool IsScaled = MGT->isIndexScaled(); 5395 bool IsSigned = MGT->isIndexSigned(); 5396 5397 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else 5398 // must be calculated before hand. 5399 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); 5400 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { 5401 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); 5402 EVT IndexVT = Index.getValueType(); 5403 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, 5404 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); 5405 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); 5406 5407 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 5408 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, 5409 MGT->getMemOperand(), IndexType, ExtType); 5410 } 5411 5412 // Lower fixed length gather to a scalable equivalent. 5413 if (VT.isFixedLengthVector()) { 5414 assert(Subtarget->useSVEForFixedLengthVectors() && 5415 "Cannot lower when not using SVE for fixed vectors!"); 5416 5417 // NOTE: Handle floating-point as if integer then bitcast the result. 5418 EVT DataVT = VT.changeVectorElementTypeToInteger(); 5419 MemVT = MemVT.changeVectorElementTypeToInteger(); 5420 5421 // Find the smallest integer fixed length vector we can use for the gather. 5422 EVT PromotedVT = VT.changeVectorElementType(MVT::i32); 5423 if (DataVT.getVectorElementType() == MVT::i64 || 5424 Index.getValueType().getVectorElementType() == MVT::i64 || 5425 Mask.getValueType().getVectorElementType() == MVT::i64) 5426 PromotedVT = VT.changeVectorElementType(MVT::i64); 5427 5428 // Promote vector operands except for passthrough, which we know is either 5429 // undef or zero, and thus best constructed directly. 5430 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5431 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); 5432 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); 5433 5434 // A promoted result type forces the need for an extending load. 
5435 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) 5436 ExtType = ISD::EXTLOAD; 5437 5438 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); 5439 5440 // Convert fixed length vector operands to scalable. 5441 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); 5442 Index = convertToScalableVector(DAG, ContainerVT, Index); 5443 Mask = convertFixedMaskToScalableVector(Mask, DAG); 5444 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT) 5445 : DAG.getConstant(0, DL, ContainerVT); 5446 5447 // Emit equivalent scalable vector gather. 5448 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 5449 SDValue Load = 5450 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL, 5451 Ops, MGT->getMemOperand(), IndexType, ExtType); 5452 5453 // Extract fixed length data then convert to the required result type. 5454 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load); 5455 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result); 5456 if (VT.isFloatingPoint()) 5457 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); 5458 5459 return DAG.getMergeValues({Result, Load.getValue(1)}, DL); 5460 } 5461 5462 // Everything else is legal. 5463 return Op; 5464 } 5465 5466 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, 5467 SelectionDAG &DAG) const { 5468 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); 5469 5470 SDLoc DL(Op); 5471 SDValue Chain = MSC->getChain(); 5472 SDValue StoreVal = MSC->getValue(); 5473 SDValue Mask = MSC->getMask(); 5474 SDValue BasePtr = MSC->getBasePtr(); 5475 SDValue Index = MSC->getIndex(); 5476 SDValue Scale = MSC->getScale(); 5477 EVT VT = StoreVal.getValueType(); 5478 EVT MemVT = MSC->getMemoryVT(); 5479 ISD::MemIndexType IndexType = MSC->getIndexType(); 5480 bool Truncating = MSC->isTruncatingStore(); 5481 5482 bool IsScaled = MSC->isIndexScaled(); 5483 bool IsSigned = MSC->isIndexSigned(); 5484 5485 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else 5486 // must be calculated before hand. 5487 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); 5488 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { 5489 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); 5490 EVT IndexVT = Index.getValueType(); 5491 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, 5492 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); 5493 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); 5494 5495 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; 5496 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, 5497 MSC->getMemOperand(), IndexType, Truncating); 5498 } 5499 5500 // Lower fixed length scatter to a scalable equivalent. 5501 if (VT.isFixedLengthVector()) { 5502 assert(Subtarget->useSVEForFixedLengthVectors() && 5503 "Cannot lower when not using SVE for fixed vectors!"); 5504 5505 // Once bitcast we treat floating-point scatters as if integer. 5506 if (VT.isFloatingPoint()) { 5507 VT = VT.changeVectorElementTypeToInteger(); 5508 MemVT = MemVT.changeVectorElementTypeToInteger(); 5509 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal); 5510 } 5511 5512 // Find the smallest integer fixed length vector we can use for the scatter. 
5513 EVT PromotedVT = VT.changeVectorElementType(MVT::i32); 5514 if (VT.getVectorElementType() == MVT::i64 || 5515 Index.getValueType().getVectorElementType() == MVT::i64 || 5516 Mask.getValueType().getVectorElementType() == MVT::i64) 5517 PromotedVT = VT.changeVectorElementType(MVT::i64); 5518 5519 // Promote vector operands. 5520 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5521 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); 5522 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); 5523 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal); 5524 5525 // A promoted value type forces the need for a truncating store. 5526 if (PromotedVT != VT) 5527 Truncating = true; 5528 5529 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); 5530 5531 // Convert fixed length vector operands to scalable. 5532 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); 5533 Index = convertToScalableVector(DAG, ContainerVT, Index); 5534 Mask = convertFixedMaskToScalableVector(Mask, DAG); 5535 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); 5536 5537 // Emit equivalent scalable vector scatter. 5538 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; 5539 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, 5540 MSC->getMemOperand(), IndexType, Truncating); 5541 } 5542 5543 // Everything else is legal. 5544 return Op; 5545 } 5546 5547 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { 5548 SDLoc DL(Op); 5549 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op); 5550 assert(LoadNode && "Expected custom lowering of a masked load node"); 5551 EVT VT = Op->getValueType(0); 5552 5553 if (useSVEForFixedLengthVectorVT( 5554 VT, 5555 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) 5556 return LowerFixedLengthVectorMLoadToSVE(Op, DAG); 5557 5558 SDValue PassThru = LoadNode->getPassThru(); 5559 SDValue Mask = LoadNode->getMask(); 5560 5561 if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) 5562 return Op; 5563 5564 SDValue Load = DAG.getMaskedLoad( 5565 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(), 5566 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(), 5567 LoadNode->getMemOperand(), LoadNode->getAddressingMode(), 5568 LoadNode->getExtensionType()); 5569 5570 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru); 5571 5572 return DAG.getMergeValues({Result, Load.getValue(1)}, DL); 5573 } 5574 5575 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. 5576 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, 5577 EVT VT, EVT MemVT, 5578 SelectionDAG &DAG) { 5579 assert(VT.isVector() && "VT should be a vector type"); 5580 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); 5581 5582 SDValue Value = ST->getValue(); 5583 5584 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract 5585 // the word lane which represent the v4i8 subvector. 
This optimizes the store 5586 // to: 5587 // 5588 // xtn v0.8b, v0.8h 5589 // str s0, [x0] 5590 5591 SDValue Undef = DAG.getUNDEF(MVT::i16); 5592 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, 5593 {Undef, Undef, Undef, Undef}); 5594 5595 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, 5596 Value, UndefVec); 5597 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); 5598 5599 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); 5600 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, 5601 Trunc, DAG.getConstant(0, DL, MVT::i64)); 5602 5603 return DAG.getStore(ST->getChain(), DL, ExtractTrunc, 5604 ST->getBasePtr(), ST->getMemOperand()); 5605 } 5606 5607 // Custom lowering for any store, vector or scalar, normal or truncating. 5608 // Currently we custom lower truncating stores from v4i16 to v4i8, volatile 5609 // stores of i128, 256-bit non-temporal stores and i64x8 (LS64) stores. 5610 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, 5611 SelectionDAG &DAG) const { 5612 SDLoc Dl(Op); 5613 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 5614 assert(StoreNode && "Can only custom lower store nodes"); 5615 5616 SDValue Value = StoreNode->getValue(); 5617 5618 EVT VT = Value.getValueType(); 5619 EVT MemVT = StoreNode->getMemoryVT(); 5620 5621 if (VT.isVector()) { 5622 if (useSVEForFixedLengthVectorVT( 5623 VT, 5624 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) 5625 return LowerFixedLengthVectorStoreToSVE(Op, DAG); 5626 5627 unsigned AS = StoreNode->getAddressSpace(); 5628 Align Alignment = StoreNode->getAlign(); 5629 if (Alignment < MemVT.getStoreSize() && 5630 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, 5631 StoreNode->getMemOperand()->getFlags(), 5632 nullptr)) { 5633 return scalarizeVectorStore(StoreNode, DAG); 5634 } 5635 5636 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 && 5637 MemVT == MVT::v4i8) { 5638 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); 5639 } 5640 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of 5641 // the custom lowering, as there are no un-paired non-temporal stores and 5642 // legalization will break up 256 bit inputs.
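// For illustration (assuming little-endian and a 256-bit type such as v8i32):
// a non-temporal store of such a value is split into two 128-bit halves here
// and selected as a single paired store, roughly:
//   stnp q0, q1, [x0]
// rather than losing the non-temporal pairing when generic legalization splits
// the 256-bit input.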
5643 ElementCount EC = MemVT.getVectorElementCount(); 5644 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && 5645 EC.isKnownEven() && 5646 ((MemVT.getScalarSizeInBits() == 8u || 5647 MemVT.getScalarSizeInBits() == 16u || 5648 MemVT.getScalarSizeInBits() == 32u || 5649 MemVT.getScalarSizeInBits() == 64u))) { 5650 SDValue Lo = 5651 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 5652 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 5653 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); 5654 SDValue Hi = 5655 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 5656 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 5657 StoreNode->getValue(), 5658 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); 5659 SDValue Result = DAG.getMemIntrinsicNode( 5660 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), 5661 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 5662 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 5663 return Result; 5664 } 5665 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { 5666 return LowerStore128(Op, DAG); 5667 } else if (MemVT == MVT::i64x8) { 5668 SDValue Value = StoreNode->getValue(); 5669 assert(Value->getValueType(0) == MVT::i64x8); 5670 SDValue Chain = StoreNode->getChain(); 5671 SDValue Base = StoreNode->getBasePtr(); 5672 EVT PtrVT = Base.getValueType(); 5673 for (unsigned i = 0; i < 8; i++) { 5674 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, 5675 Value, DAG.getConstant(i, Dl, MVT::i32)); 5676 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, 5677 DAG.getConstant(i * 8, Dl, PtrVT)); 5678 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), 5679 StoreNode->getOriginalAlign()); 5680 } 5681 return Chain; 5682 } 5683 5684 return SDValue(); 5685 } 5686 5687 /// Lower atomic or volatile 128-bit stores to a single STP instruction. 5688 SDValue AArch64TargetLowering::LowerStore128(SDValue Op, 5689 SelectionDAG &DAG) const { 5690 MemSDNode *StoreNode = cast<MemSDNode>(Op); 5691 assert(StoreNode->getMemoryVT() == MVT::i128); 5692 assert(StoreNode->isVolatile() || StoreNode->isAtomic()); 5693 assert(!StoreNode->isAtomic() || 5694 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || 5695 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); 5696 5697 SDValue Value = StoreNode->getOpcode() == ISD::STORE 5698 ? 
StoreNode->getOperand(1) 5699 : StoreNode->getOperand(2); 5700 SDLoc DL(Op); 5701 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, 5702 DAG.getConstant(0, DL, MVT::i64)); 5703 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, 5704 DAG.getConstant(1, DL, MVT::i64)); 5705 SDValue Result = DAG.getMemIntrinsicNode( 5706 AArch64ISD::STP, DL, DAG.getVTList(MVT::Other), 5707 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 5708 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 5709 return Result; 5710 } 5711 5712 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, 5713 SelectionDAG &DAG) const { 5714 SDLoc DL(Op); 5715 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 5716 assert(LoadNode && "Expected custom lowering of a load node"); 5717 5718 if (LoadNode->getMemoryVT() == MVT::i64x8) { 5719 SmallVector<SDValue, 8> Ops; 5720 SDValue Base = LoadNode->getBasePtr(); 5721 SDValue Chain = LoadNode->getChain(); 5722 EVT PtrVT = Base.getValueType(); 5723 for (unsigned i = 0; i < 8; i++) { 5724 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, 5725 DAG.getConstant(i * 8, DL, PtrVT)); 5726 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, 5727 LoadNode->getPointerInfo(), 5728 LoadNode->getOriginalAlign()); 5729 Ops.push_back(Part); 5730 Chain = SDValue(Part.getNode(), 1); 5731 } 5732 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); 5733 return DAG.getMergeValues({Loaded, Chain}, DL); 5734 } 5735 5736 // Custom lowering for extending v4i8 vector loads. 5737 EVT VT = Op->getValueType(0); 5738 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); 5739 5740 if (LoadNode->getMemoryVT() != MVT::v4i8) 5741 return SDValue(); 5742 5743 unsigned ExtType; 5744 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) 5745 ExtType = ISD::SIGN_EXTEND; 5746 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || 5747 LoadNode->getExtensionType() == ISD::EXTLOAD) 5748 ExtType = ISD::ZERO_EXTEND; 5749 else 5750 return SDValue(); 5751 5752 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), 5753 LoadNode->getBasePtr(), MachinePointerInfo()); 5754 SDValue Chain = Load.getValue(1); 5755 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); 5756 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); 5757 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); 5758 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, 5759 DAG.getConstant(0, DL, MVT::i64)); 5760 if (VT == MVT::v4i32) 5761 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); 5762 return DAG.getMergeValues({Ext, Chain}, DL); 5763 } 5764 5765 // Generate SUBS and CSEL for integer abs. 5766 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { 5767 MVT VT = Op.getSimpleValueType(); 5768 5769 if (VT.isVector()) 5770 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); 5771 5772 SDLoc DL(Op); 5773 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 5774 Op.getOperand(0)); 5775 // Generate SUBS & CSEL. 
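// In DAG terms the scalar expansion below is (sketch):
//   abs(x) --> csel(x, 0 - x, pl, flags(subs(x, 0)))
// which typically selects to a compare-and-conditional-negate sequence such as
// "cmp w0, #0; cneg w0, w0, mi" (illustrative only, exact registers will vary).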
5776 SDValue Cmp = 5777 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 5778 Op.getOperand(0), DAG.getConstant(0, DL, VT)); 5779 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, 5780 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 5781 Cmp.getValue(1)); 5782 } 5783 5784 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5785 SDValue Chain = Op.getOperand(0); 5786 SDValue Cond = Op.getOperand(1); 5787 SDValue Dest = Op.getOperand(2); 5788 5789 AArch64CC::CondCode CC; 5790 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { 5791 SDLoc dl(Op); 5792 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); 5793 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 5794 Cmp); 5795 } 5796 5797 return SDValue(); 5798 } 5799 5800 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 5801 SelectionDAG &DAG) const { 5802 LLVM_DEBUG(dbgs() << "Custom lowering: "); 5803 LLVM_DEBUG(Op.dump()); 5804 5805 switch (Op.getOpcode()) { 5806 default: 5807 llvm_unreachable("unimplemented operand"); 5808 return SDValue(); 5809 case ISD::BITCAST: 5810 return LowerBITCAST(Op, DAG); 5811 case ISD::GlobalAddress: 5812 return LowerGlobalAddress(Op, DAG); 5813 case ISD::GlobalTLSAddress: 5814 return LowerGlobalTLSAddress(Op, DAG); 5815 case ISD::SETCC: 5816 case ISD::STRICT_FSETCC: 5817 case ISD::STRICT_FSETCCS: 5818 return LowerSETCC(Op, DAG); 5819 case ISD::SETCCCARRY: 5820 return LowerSETCCCARRY(Op, DAG); 5821 case ISD::BRCOND: 5822 return LowerBRCOND(Op, DAG); 5823 case ISD::BR_CC: 5824 return LowerBR_CC(Op, DAG); 5825 case ISD::SELECT: 5826 return LowerSELECT(Op, DAG); 5827 case ISD::SELECT_CC: 5828 return LowerSELECT_CC(Op, DAG); 5829 case ISD::JumpTable: 5830 return LowerJumpTable(Op, DAG); 5831 case ISD::BR_JT: 5832 return LowerBR_JT(Op, DAG); 5833 case ISD::ConstantPool: 5834 return LowerConstantPool(Op, DAG); 5835 case ISD::BlockAddress: 5836 return LowerBlockAddress(Op, DAG); 5837 case ISD::VASTART: 5838 return LowerVASTART(Op, DAG); 5839 case ISD::VACOPY: 5840 return LowerVACOPY(Op, DAG); 5841 case ISD::VAARG: 5842 return LowerVAARG(Op, DAG); 5843 case ISD::ADDCARRY: 5844 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/); 5845 case ISD::SUBCARRY: 5846 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/); 5847 case ISD::SADDO_CARRY: 5848 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/); 5849 case ISD::SSUBO_CARRY: 5850 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/); 5851 case ISD::SADDO: 5852 case ISD::UADDO: 5853 case ISD::SSUBO: 5854 case ISD::USUBO: 5855 case ISD::SMULO: 5856 case ISD::UMULO: 5857 return LowerXALUO(Op, DAG); 5858 case ISD::FADD: 5859 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); 5860 case ISD::FSUB: 5861 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); 5862 case ISD::FMUL: 5863 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); 5864 case ISD::FMA: 5865 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); 5866 case ISD::FDIV: 5867 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); 5868 case ISD::FNEG: 5869 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); 5870 case ISD::FCEIL: 5871 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); 5872 case ISD::FFLOOR: 5873 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); 5874 case ISD::FNEARBYINT: 5875 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); 5876 case ISD::FRINT: 5877 
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); 5878 case ISD::FROUND: 5879 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); 5880 case ISD::FROUNDEVEN: 5881 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); 5882 case ISD::FTRUNC: 5883 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); 5884 case ISD::FSQRT: 5885 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); 5886 case ISD::FABS: 5887 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); 5888 case ISD::FP_ROUND: 5889 case ISD::STRICT_FP_ROUND: 5890 return LowerFP_ROUND(Op, DAG); 5891 case ISD::FP_EXTEND: 5892 return LowerFP_EXTEND(Op, DAG); 5893 case ISD::FRAMEADDR: 5894 return LowerFRAMEADDR(Op, DAG); 5895 case ISD::SPONENTRY: 5896 return LowerSPONENTRY(Op, DAG); 5897 case ISD::RETURNADDR: 5898 return LowerRETURNADDR(Op, DAG); 5899 case ISD::ADDROFRETURNADDR: 5900 return LowerADDROFRETURNADDR(Op, DAG); 5901 case ISD::CONCAT_VECTORS: 5902 return LowerCONCAT_VECTORS(Op, DAG); 5903 case ISD::INSERT_VECTOR_ELT: 5904 return LowerINSERT_VECTOR_ELT(Op, DAG); 5905 case ISD::EXTRACT_VECTOR_ELT: 5906 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 5907 case ISD::BUILD_VECTOR: 5908 return LowerBUILD_VECTOR(Op, DAG); 5909 case ISD::ZERO_EXTEND_VECTOR_INREG: 5910 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG); 5911 case ISD::VECTOR_SHUFFLE: 5912 return LowerVECTOR_SHUFFLE(Op, DAG); 5913 case ISD::SPLAT_VECTOR: 5914 return LowerSPLAT_VECTOR(Op, DAG); 5915 case ISD::EXTRACT_SUBVECTOR: 5916 return LowerEXTRACT_SUBVECTOR(Op, DAG); 5917 case ISD::INSERT_SUBVECTOR: 5918 return LowerINSERT_SUBVECTOR(Op, DAG); 5919 case ISD::SDIV: 5920 case ISD::UDIV: 5921 return LowerDIV(Op, DAG); 5922 case ISD::SMIN: 5923 case ISD::UMIN: 5924 case ISD::SMAX: 5925 case ISD::UMAX: 5926 return LowerMinMax(Op, DAG); 5927 case ISD::SRA: 5928 case ISD::SRL: 5929 case ISD::SHL: 5930 return LowerVectorSRA_SRL_SHL(Op, DAG); 5931 case ISD::SHL_PARTS: 5932 case ISD::SRL_PARTS: 5933 case ISD::SRA_PARTS: 5934 return LowerShiftParts(Op, DAG); 5935 case ISD::CTPOP: 5936 case ISD::PARITY: 5937 return LowerCTPOP_PARITY(Op, DAG); 5938 case ISD::FCOPYSIGN: 5939 return LowerFCOPYSIGN(Op, DAG); 5940 case ISD::OR: 5941 return LowerVectorOR(Op, DAG); 5942 case ISD::XOR: 5943 return LowerXOR(Op, DAG); 5944 case ISD::PREFETCH: 5945 return LowerPREFETCH(Op, DAG); 5946 case ISD::SINT_TO_FP: 5947 case ISD::UINT_TO_FP: 5948 case ISD::STRICT_SINT_TO_FP: 5949 case ISD::STRICT_UINT_TO_FP: 5950 return LowerINT_TO_FP(Op, DAG); 5951 case ISD::FP_TO_SINT: 5952 case ISD::FP_TO_UINT: 5953 case ISD::STRICT_FP_TO_SINT: 5954 case ISD::STRICT_FP_TO_UINT: 5955 return LowerFP_TO_INT(Op, DAG); 5956 case ISD::FP_TO_SINT_SAT: 5957 case ISD::FP_TO_UINT_SAT: 5958 return LowerFP_TO_INT_SAT(Op, DAG); 5959 case ISD::FSINCOS: 5960 return LowerFSINCOS(Op, DAG); 5961 case ISD::GET_ROUNDING: 5962 return LowerGET_ROUNDING(Op, DAG); 5963 case ISD::SET_ROUNDING: 5964 return LowerSET_ROUNDING(Op, DAG); 5965 case ISD::MUL: 5966 return LowerMUL(Op, DAG); 5967 case ISD::MULHS: 5968 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED); 5969 case ISD::MULHU: 5970 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); 5971 case ISD::INTRINSIC_W_CHAIN: 5972 return LowerINTRINSIC_W_CHAIN(Op, DAG); 5973 case ISD::INTRINSIC_WO_CHAIN: 5974 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 5975 case ISD::INTRINSIC_VOID: 5976 return LowerINTRINSIC_VOID(Op, DAG); 5977 case ISD::ATOMIC_STORE: 5978 if 
(cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) { 5979 assert(Subtarget->hasLSE2()); 5980 return LowerStore128(Op, DAG); 5981 } 5982 return SDValue(); 5983 case ISD::STORE: 5984 return LowerSTORE(Op, DAG); 5985 case ISD::MSTORE: 5986 return LowerFixedLengthVectorMStoreToSVE(Op, DAG); 5987 case ISD::MGATHER: 5988 return LowerMGATHER(Op, DAG); 5989 case ISD::MSCATTER: 5990 return LowerMSCATTER(Op, DAG); 5991 case ISD::VECREDUCE_SEQ_FADD: 5992 return LowerVECREDUCE_SEQ_FADD(Op, DAG); 5993 case ISD::VECREDUCE_ADD: 5994 case ISD::VECREDUCE_AND: 5995 case ISD::VECREDUCE_OR: 5996 case ISD::VECREDUCE_XOR: 5997 case ISD::VECREDUCE_SMAX: 5998 case ISD::VECREDUCE_SMIN: 5999 case ISD::VECREDUCE_UMAX: 6000 case ISD::VECREDUCE_UMIN: 6001 case ISD::VECREDUCE_FADD: 6002 case ISD::VECREDUCE_FMAX: 6003 case ISD::VECREDUCE_FMIN: 6004 return LowerVECREDUCE(Op, DAG); 6005 case ISD::ATOMIC_LOAD_SUB: 6006 return LowerATOMIC_LOAD_SUB(Op, DAG); 6007 case ISD::ATOMIC_LOAD_AND: 6008 return LowerATOMIC_LOAD_AND(Op, DAG); 6009 case ISD::DYNAMIC_STACKALLOC: 6010 return LowerDYNAMIC_STACKALLOC(Op, DAG); 6011 case ISD::VSCALE: 6012 return LowerVSCALE(Op, DAG); 6013 case ISD::ANY_EXTEND: 6014 case ISD::SIGN_EXTEND: 6015 case ISD::ZERO_EXTEND: 6016 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); 6017 case ISD::SIGN_EXTEND_INREG: { 6018 // Only custom lower when ExtraVT has a legal byte based element type. 6019 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 6020 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 6021 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && 6022 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) 6023 return SDValue(); 6024 6025 return LowerToPredicatedOp(Op, DAG, 6026 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); 6027 } 6028 case ISD::TRUNCATE: 6029 return LowerTRUNCATE(Op, DAG); 6030 case ISD::MLOAD: 6031 return LowerMLOAD(Op, DAG); 6032 case ISD::LOAD: 6033 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 6034 Subtarget->forceStreamingCompatibleSVE())) 6035 return LowerFixedLengthVectorLoadToSVE(Op, DAG); 6036 return LowerLOAD(Op, DAG); 6037 case ISD::ADD: 6038 case ISD::AND: 6039 case ISD::SUB: 6040 return LowerToScalableOp(Op, DAG); 6041 case ISD::FMAXIMUM: 6042 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); 6043 case ISD::FMAXNUM: 6044 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); 6045 case ISD::FMINIMUM: 6046 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED); 6047 case ISD::FMINNUM: 6048 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); 6049 case ISD::VSELECT: 6050 return LowerFixedLengthVectorSelectToSVE(Op, DAG); 6051 case ISD::ABS: 6052 return LowerABS(Op, DAG); 6053 case ISD::ABDS: 6054 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED); 6055 case ISD::ABDU: 6056 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); 6057 case ISD::AVGFLOORS: 6058 return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED); 6059 case ISD::AVGFLOORU: 6060 return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED); 6061 case ISD::AVGCEILS: 6062 return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED); 6063 case ISD::AVGCEILU: 6064 return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED); 6065 case ISD::BITREVERSE: 6066 return LowerBitreverse(Op, DAG); 6067 case ISD::BSWAP: 6068 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); 6069 case ISD::CTLZ: 6070 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU); 6071 case ISD::CTTZ: 6072 return LowerCTTZ(Op, DAG); 6073 case 
ISD::VECTOR_SPLICE: 6074 return LowerVECTOR_SPLICE(Op, DAG); 6075 case ISD::STRICT_LROUND: 6076 case ISD::STRICT_LLROUND: 6077 case ISD::STRICT_LRINT: 6078 case ISD::STRICT_LLRINT: { 6079 assert(Op.getOperand(1).getValueType() == MVT::f16 && 6080 "Expected custom lowering of rounding operations only for f16"); 6081 SDLoc DL(Op); 6082 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, 6083 {Op.getOperand(0), Op.getOperand(1)}); 6084 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, 6085 {Ext.getValue(1), Ext.getValue(0)}); 6086 } 6087 case ISD::WRITE_REGISTER: { 6088 assert(Op.getOperand(2).getValueType() == MVT::i128 && 6089 "WRITE_REGISTER custom lowering is only for 128-bit sysregs"); 6090 SDLoc DL(Op); 6091 6092 SDValue Chain = Op.getOperand(0); 6093 SDValue SysRegName = Op.getOperand(1); 6094 SDValue Pair = Op.getOperand(2); 6095 6096 SDValue PairLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair, 6097 DAG.getConstant(0, DL, MVT::i32)); 6098 SDValue PairHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair, 6099 DAG.getConstant(1, DL, MVT::i32)); 6100 6101 // chain = MSRR(chain, sysregname, lo, hi) 6102 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain, 6103 SysRegName, PairLo, PairHi); 6104 6105 return Result; 6106 } 6107 } 6108 } 6109 6110 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { 6111 return !Subtarget->useSVEForFixedLengthVectors(); 6112 } 6113 6114 bool AArch64TargetLowering::isVScaleKnownToBeAPowerOfTwo() const { 6115 return true; 6116 } 6117 6118 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( 6119 EVT VT, bool OverrideNEON) const { 6120 if (!VT.isFixedLengthVector() || !VT.isSimple()) 6121 return false; 6122 6123 // Don't use SVE for vectors we cannot scalarize if required. 6124 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 6125 // Fixed length predicates should be promoted to i8. 6126 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. 6127 case MVT::i1: 6128 default: 6129 return false; 6130 case MVT::i8: 6131 case MVT::i16: 6132 case MVT::i32: 6133 case MVT::i64: 6134 case MVT::f16: 6135 case MVT::f32: 6136 case MVT::f64: 6137 break; 6138 } 6139 6140 // All SVE implementations support NEON sized vectors. 6141 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) 6142 return Subtarget->hasSVE(); 6143 6144 // Ensure NEON MVTs only belong to a single register class. 6145 if (VT.getFixedSizeInBits() <= 128) 6146 return false; 6147 6148 // Ensure wider than NEON code generation is enabled. 6149 if (!Subtarget->useSVEForFixedLengthVectors()) 6150 return false; 6151 6152 // Don't use SVE for types that don't fit. 6153 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) 6154 return false; 6155 6156 // TODO: Perhaps an artificial restriction, but worth having whilst getting 6157 // the base fixed length SVE support in place. 
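// Illustrative consequences of the checks in this function (assuming a
// configured minimum SVE width of 512 bits and OverrideNEON left unset):
// v8i64 and v32i16 use SVE containers, v16i8 stays on NEON since it fits in
// 128 bits, and a non-power-of-two type such as v6i32 is rejected by the
// check below and handled by the default legalizer instead.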
6158 if (!VT.isPow2VectorType()) 6159 return false; 6160 6161 return true; 6162 } 6163 6164 //===----------------------------------------------------------------------===// 6165 // Calling Convention Implementation 6166 //===----------------------------------------------------------------------===// 6167 6168 static unsigned getIntrinsicID(const SDNode *N) { 6169 unsigned Opcode = N->getOpcode(); 6170 switch (Opcode) { 6171 default: 6172 return Intrinsic::not_intrinsic; 6173 case ISD::INTRINSIC_WO_CHAIN: { 6174 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 6175 if (IID < Intrinsic::num_intrinsics) 6176 return IID; 6177 return Intrinsic::not_intrinsic; 6178 } 6179 } 6180 } 6181 6182 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, 6183 SDValue N1) const { 6184 if (!N0.hasOneUse()) 6185 return false; 6186 6187 unsigned IID = getIntrinsicID(N1.getNode()); 6188 // Avoid reassociating expressions that can be lowered to smlal/umlal. 6189 if (IID == Intrinsic::aarch64_neon_umull || 6190 N1.getOpcode() == AArch64ISD::UMULL || 6191 IID == Intrinsic::aarch64_neon_smull || 6192 N1.getOpcode() == AArch64ISD::SMULL) 6193 return N0.getOpcode() != ISD::ADD; 6194 6195 return true; 6196 } 6197 6198 /// Selects the correct CCAssignFn for a given CallingConvention value. 6199 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 6200 bool IsVarArg) const { 6201 switch (CC) { 6202 default: 6203 report_fatal_error("Unsupported calling convention."); 6204 case CallingConv::WebKit_JS: 6205 return CC_AArch64_WebKit_JS; 6206 case CallingConv::GHC: 6207 return CC_AArch64_GHC; 6208 case CallingConv::C: 6209 case CallingConv::Fast: 6210 case CallingConv::PreserveMost: 6211 case CallingConv::CXX_FAST_TLS: 6212 case CallingConv::Swift: 6213 case CallingConv::SwiftTail: 6214 case CallingConv::Tail: 6215 if (Subtarget->isTargetWindows() && IsVarArg) { 6216 if (Subtarget->isWindowsArm64EC()) 6217 return CC_AArch64_Arm64EC_VarArg; 6218 return CC_AArch64_Win64_VarArg; 6219 } 6220 if (!Subtarget->isTargetDarwin()) 6221 return CC_AArch64_AAPCS; 6222 if (!IsVarArg) 6223 return CC_AArch64_DarwinPCS; 6224 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg 6225 : CC_AArch64_DarwinPCS_VarArg; 6226 case CallingConv::Win64: 6227 if (IsVarArg) { 6228 if (Subtarget->isWindowsArm64EC()) 6229 return CC_AArch64_Arm64EC_VarArg; 6230 return CC_AArch64_Win64_VarArg; 6231 } 6232 return CC_AArch64_AAPCS; 6233 case CallingConv::CFGuard_Check: 6234 return CC_AArch64_Win64_CFGuard_Check; 6235 case CallingConv::AArch64_VectorCall: 6236 case CallingConv::AArch64_SVE_VectorCall: 6237 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: 6238 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: 6239 return CC_AArch64_AAPCS; 6240 } 6241 } 6242 6243 CCAssignFn * 6244 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { 6245 return CC == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS 6246 : RetCC_AArch64_AAPCS; 6247 } 6248 6249 6250 unsigned 6251 AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, 6252 SelectionDAG &DAG) const { 6253 MachineFunction &MF = DAG.getMachineFunction(); 6254 MachineFrameInfo &MFI = MF.getFrameInfo(); 6255 6256 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) 6257 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, 6258 DAG.getConstant(1, DL, MVT::i32)); 6259 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); 6260 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; 6261 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); 6262 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); 6263 Chain = Buffer.getValue(1); 6264 MFI.CreateVariableSizedObject(Align(1), nullptr); 6265 6266 // Allocate an additional TPIDR2 object on the stack (16 bytes) 6267 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); 6268 6269 // Store the buffer pointer to the TPIDR2 stack object. 6270 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); 6271 SDValue Ptr = DAG.getFrameIndex( 6272 TPIDR2Obj, 6273 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 6274 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); 6275 6276 return TPIDR2Obj; 6277 } 6278 6279 SDValue AArch64TargetLowering::LowerFormalArguments( 6280 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 6281 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 6282 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 6283 MachineFunction &MF = DAG.getMachineFunction(); 6284 const Function &F = MF.getFunction(); 6285 MachineFrameInfo &MFI = MF.getFrameInfo(); 6286 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv()); 6287 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 6288 6289 SmallVector<ISD::OutputArg, 4> Outs; 6290 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs, 6291 DAG.getTargetLoweringInfo(), MF.getDataLayout()); 6292 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); })) 6293 FuncInfo->setIsSVECC(true); 6294 6295 // Assign locations to all of the incoming arguments. 6296 SmallVector<CCValAssign, 16> ArgLocs; 6297 DenseMap<unsigned, SDValue> CopiedRegs; 6298 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 6299 6300 // At this point, Ins[].VT may already be promoted to i32. To correctly 6301 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 6302 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 6303 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 6304 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 6305 // LocVT. 6306 unsigned NumArgs = Ins.size(); 6307 Function::const_arg_iterator CurOrigArg = F.arg_begin(); 6308 unsigned CurArgIdx = 0; 6309 for (unsigned i = 0; i != NumArgs; ++i) { 6310 MVT ValVT = Ins[i].VT; 6311 if (Ins[i].isOrigArg()) { 6312 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 6313 CurArgIdx = Ins[i].getOrigArgIndex(); 6314 6315 // Get type of the original argument. 6316 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 6317 /*AllowUnknown*/ true); 6318 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 6319 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
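// For example, a source-level i8 parameter reaches this loop with Ins[i].VT
// already promoted to i32; resetting ValVT below lets the assignment function
// see the original 8-bit width and give the argument a correctly sized stack
// slot instead of a full i32 slot when it is passed in memory.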
6320 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 6321 ValVT = MVT::i8; 6322 else if (ActualMVT == MVT::i16) 6323 ValVT = MVT::i16; 6324 } 6325 bool UseVarArgCC = false; 6326 if (IsWin64) 6327 UseVarArgCC = isVarArg; 6328 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); 6329 bool Res = 6330 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 6331 assert(!Res && "Call operand has unhandled type"); 6332 (void)Res; 6333 } 6334 6335 SMEAttrs Attrs(MF.getFunction()); 6336 bool IsLocallyStreaming = 6337 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); 6338 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); 6339 SDValue Glue = Chain.getValue(1); 6340 6341 SmallVector<SDValue, 16> ArgValues; 6342 unsigned ExtraArgLocs = 0; 6343 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 6344 CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; 6345 6346 if (Ins[i].Flags.isByVal()) { 6347 // Byval is used for HFAs in the PCS, but the system should work in a 6348 // non-compliant manner for larger structs. 6349 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6350 int Size = Ins[i].Flags.getByValSize(); 6351 unsigned NumRegs = (Size + 7) / 8; 6352 6353 // FIXME: This works on big-endian for composite byvals, which are the common 6354 // case. It should also work for fundamental types too. 6355 unsigned FrameIdx = 6356 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 6357 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 6358 InVals.push_back(FrameIdxN); 6359 6360 continue; 6361 } 6362 6363 if (Ins[i].Flags.isSwiftAsync()) 6364 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 6365 6366 SDValue ArgValue; 6367 if (VA.isRegLoc()) { 6368 // Arguments stored in registers. 6369 EVT RegVT = VA.getLocVT(); 6370 const TargetRegisterClass *RC; 6371 6372 if (RegVT == MVT::i32) 6373 RC = &AArch64::GPR32RegClass; 6374 else if (RegVT == MVT::i64) 6375 RC = &AArch64::GPR64RegClass; 6376 else if (RegVT == MVT::f16 || RegVT == MVT::bf16) 6377 RC = &AArch64::FPR16RegClass; 6378 else if (RegVT == MVT::f32) 6379 RC = &AArch64::FPR32RegClass; 6380 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 6381 RC = &AArch64::FPR64RegClass; 6382 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 6383 RC = &AArch64::FPR128RegClass; 6384 else if (RegVT.isScalableVector() && 6385 RegVT.getVectorElementType() == MVT::i1) { 6386 FuncInfo->setIsSVECC(true); 6387 RC = &AArch64::PPRRegClass; 6388 } else if (RegVT.isScalableVector()) { 6389 FuncInfo->setIsSVECC(true); 6390 RC = &AArch64::ZPRRegClass; 6391 } else 6392 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 6393 6394 // Transform the arguments in physical registers into virtual ones. 6395 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 6396 6397 if (IsLocallyStreaming) { 6398 // LocallyStreamingFunctions must insert the SMSTART in the correct 6399 // position, so we use Glue to ensure no instructions can be scheduled 6400 // between the chain of: 6401 // t0: ch,glue = EntryNode 6402 // t1: res,ch,glue = CopyFromReg 6403 // ... 6404 // tn: res,ch,glue = CopyFromReg t(n-1), .. 6405 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2 6406 // ^^^^^^ 6407 // This will be the new Chain/Root node. 6408 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); 6409 Glue = ArgValue.getValue(2); 6410 } else 6411 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 6412 6413 // If this is an 8, 16 or 32-bit value, it is really passed promoted 6414 // to 64 bits. 
Insert an assert[sz]ext to capture this, then 6415 // truncate to the right size. 6416 switch (VA.getLocInfo()) { 6417 default: 6418 llvm_unreachable("Unknown loc info!"); 6419 case CCValAssign::Full: 6420 break; 6421 case CCValAssign::Indirect: 6422 assert((VA.getValVT().isScalableVector() || 6423 Subtarget->isWindowsArm64EC()) && 6424 "Indirect arguments should be scalable on most subtargets"); 6425 break; 6426 case CCValAssign::BCvt: 6427 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 6428 break; 6429 case CCValAssign::AExt: 6430 case CCValAssign::SExt: 6431 case CCValAssign::ZExt: 6432 break; 6433 case CCValAssign::AExtUpper: 6434 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, 6435 DAG.getConstant(32, DL, RegVT)); 6436 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); 6437 break; 6438 } 6439 } else { // VA.isRegLoc() 6440 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 6441 unsigned ArgOffset = VA.getLocMemOffset(); 6442 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect 6443 ? VA.getLocVT().getSizeInBits() 6444 : VA.getValVT().getSizeInBits()) / 8; 6445 6446 uint32_t BEAlign = 0; 6447 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 6448 !Ins[i].Flags.isInConsecutiveRegs()) 6449 BEAlign = 8 - ArgSize; 6450 6451 SDValue FIN; 6452 MachinePointerInfo PtrInfo; 6453 if (isVarArg && Subtarget->isWindowsArm64EC()) { 6454 // In the ARM64EC varargs convention, fixed arguments on the stack are 6455 // accessed relative to x4, not sp. 6456 unsigned ObjOffset = ArgOffset + BEAlign; 6457 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); 6458 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 6459 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, 6460 DAG.getConstant(ObjOffset, DL, MVT::i64)); 6461 PtrInfo = MachinePointerInfo::getUnknownStack(MF); 6462 } else { 6463 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 6464 6465 // Create load nodes to retrieve arguments from the stack. 
6466 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 6467 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 6468 } 6469 6470 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 6471 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 6472 MVT MemVT = VA.getValVT(); 6473 6474 switch (VA.getLocInfo()) { 6475 default: 6476 break; 6477 case CCValAssign::Trunc: 6478 case CCValAssign::BCvt: 6479 MemVT = VA.getLocVT(); 6480 break; 6481 case CCValAssign::Indirect: 6482 assert((VA.getValVT().isScalableVector() || 6483 Subtarget->isWindowsArm64EC()) && 6484 "Indirect arguments should be scalable on most subtargets"); 6485 MemVT = VA.getLocVT(); 6486 break; 6487 case CCValAssign::SExt: 6488 ExtType = ISD::SEXTLOAD; 6489 break; 6490 case CCValAssign::ZExt: 6491 ExtType = ISD::ZEXTLOAD; 6492 break; 6493 case CCValAssign::AExt: 6494 ExtType = ISD::EXTLOAD; 6495 break; 6496 } 6497 6498 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo, 6499 MemVT); 6500 } 6501 6502 if (VA.getLocInfo() == CCValAssign::Indirect) { 6503 assert( 6504 (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) && 6505 "Indirect arguments should be scalable on most subtargets"); 6506 6507 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); 6508 unsigned NumParts = 1; 6509 if (Ins[i].Flags.isInConsecutiveRegs()) { 6510 assert(!Ins[i].Flags.isInConsecutiveRegsLast()); 6511 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) 6512 ++NumParts; 6513 } 6514 6515 MVT PartLoad = VA.getValVT(); 6516 SDValue Ptr = ArgValue; 6517 6518 // Ensure we generate all loads for each tuple part, whilst updating the 6519 // pointer after each load correctly using vscale. 6520 while (NumParts > 0) { 6521 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); 6522 InVals.push_back(ArgValue); 6523 NumParts--; 6524 if (NumParts > 0) { 6525 SDValue BytesIncrement; 6526 if (PartLoad.isScalableVector()) { 6527 BytesIncrement = DAG.getVScale( 6528 DL, Ptr.getValueType(), 6529 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); 6530 } else { 6531 BytesIncrement = DAG.getConstant( 6532 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, 6533 Ptr.getValueType()); 6534 } 6535 SDNodeFlags Flags; 6536 Flags.setNoUnsignedWrap(true); 6537 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 6538 BytesIncrement, Flags); 6539 ExtraArgLocs++; 6540 i++; 6541 } 6542 } 6543 } else { 6544 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) 6545 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), 6546 ArgValue, DAG.getValueType(MVT::i32)); 6547 6548 // i1 arguments are zero-extended to i8 by the caller. Emit a 6549 // hint to reflect this. 6550 if (Ins[i].isOrigArg()) { 6551 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex()); 6552 if (OrigArg->getType()->isIntegerTy(1)) { 6553 if (!Ins[i].Flags.isZExt()) { 6554 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL, 6555 ArgValue.getValueType(), ArgValue); 6556 } 6557 } 6558 } 6559 6560 InVals.push_back(ArgValue); 6561 } 6562 } 6563 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); 6564 6565 // Insert the SMSTART if this is a locally streaming function and 6566 // make sure it is Glued to the last CopyFromReg value. 
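// Background for the block below (summary, not normative): a locally streaming
// function has a non-streaming interface but a streaming body, so streaming
// mode must be switched on at entry. Gluing the SMSTART to the CopyFromReg
// chain built above ensures the mode change cannot be scheduled before the
// incoming register arguments have been read.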
6567 if (IsLocallyStreaming) { 6568 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 6569 Chain = DAG.getNode( 6570 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), 6571 {DAG.getRoot(), 6572 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), 6573 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), 6574 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); 6575 // Ensure that the SMSTART happens after the CopyWithChain such that its 6576 // chain result is used. 6577 for (unsigned I=0; I<InVals.size(); ++I) { 6578 Register Reg = MF.getRegInfo().createVirtualRegister( 6579 getRegClassFor(InVals[I].getValueType().getSimpleVT())); 6580 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]); 6581 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg, 6582 InVals[I].getValueType()); 6583 } 6584 } 6585 6586 // varargs 6587 if (isVarArg) { 6588 if (!Subtarget->isTargetDarwin() || IsWin64) { 6589 // The AAPCS variadic function ABI is identical to the non-variadic 6590 // one. As a result there may be more arguments in registers and we should 6591 // save them for future reference. 6592 // Win64 variadic functions also pass arguments in registers, but all float 6593 // arguments are passed in integer registers. 6594 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 6595 } 6596 6597 // This will point to the next argument passed via stack. 6598 unsigned StackOffset = CCInfo.getNextStackOffset(); 6599 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 6600 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8); 6601 FuncInfo->setVarArgsStackOffset(StackOffset); 6602 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); 6603 6604 if (MFI.hasMustTailInVarArgFunc()) { 6605 SmallVector<MVT, 2> RegParmTypes; 6606 RegParmTypes.push_back(MVT::i64); 6607 RegParmTypes.push_back(MVT::f128); 6608 // Compute the set of forwarded registers. The rest are scratch. 6609 SmallVectorImpl<ForwardedRegister> &Forwards = 6610 FuncInfo->getForwardedMustTailRegParms(); 6611 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, 6612 CC_AArch64_AAPCS); 6613 6614 // Conservatively forward X8, since it might be used for aggregate return. 6615 if (!CCInfo.isAllocated(AArch64::X8)) { 6616 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); 6617 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); 6618 } 6619 } 6620 } 6621 6622 // On Windows, InReg pointers must be returned, so record the pointer in a 6623 // virtual register at the start of the function so it can be returned in the 6624 // epilogue. 
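// Sketch of the intent (hedged): when an sret pointer argument is also marked
// "inreg", the Windows ABI expects that pointer to come back in the return
// register, so its incoming value is stashed in SRetReturnReg here and copied
// out again when the return is lowered.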
6625 if (IsWin64) { 6626 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 6627 if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) { 6628 assert(!FuncInfo->getSRetReturnReg()); 6629 6630 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 6631 Register Reg = 6632 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 6633 FuncInfo->setSRetReturnReg(Reg); 6634 6635 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); 6636 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); 6637 break; 6638 } 6639 } 6640 } 6641 6642 unsigned StackArgSize = CCInfo.getNextStackOffset(); 6643 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 6644 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 6645 // This is a non-standard ABI so by fiat I say we're allowed to make full 6646 // use of the stack area to be popped, which must be aligned to 16 bytes in 6647 // any case: 6648 StackArgSize = alignTo(StackArgSize, 16); 6649 6650 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 6651 // a multiple of 16. 6652 FuncInfo->setArgumentStackToRestore(StackArgSize); 6653 6654 // This realignment carries over to the available bytes below. Our own 6655 // callers will guarantee the space is free by giving an aligned value to 6656 // CALLSEQ_START. 6657 } 6658 // Even if we're not expected to free up the space, it's useful to know how 6659 // much is there while considering tail calls (because we can reuse it). 6660 FuncInfo->setBytesInStackArgArea(StackArgSize); 6661 6662 if (Subtarget->hasCustomCallingConv()) 6663 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); 6664 6665 // Conservatively assume the function requires the lazy-save mechanism. 6666 if (SMEAttrs(MF.getFunction()).hasZAState()) { 6667 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); 6668 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); 6669 } 6670 6671 return Chain; 6672 } 6673 6674 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 6675 SelectionDAG &DAG, 6676 const SDLoc &DL, 6677 SDValue &Chain) const { 6678 MachineFunction &MF = DAG.getMachineFunction(); 6679 MachineFrameInfo &MFI = MF.getFrameInfo(); 6680 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 6681 auto PtrVT = getPointerTy(DAG.getDataLayout()); 6682 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 6683 6684 SmallVector<SDValue, 8> MemOps; 6685 6686 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 6687 AArch64::X3, AArch64::X4, AArch64::X5, 6688 AArch64::X6, AArch64::X7 }; 6689 unsigned NumGPRArgRegs = std::size(GPRArgRegs); 6690 if (Subtarget->isWindowsArm64EC()) { 6691 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs 6692 // functions. 6693 NumGPRArgRegs = 4; 6694 } 6695 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 6696 6697 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 6698 int GPRIdx = 0; 6699 if (GPRSaveSize != 0) { 6700 if (IsWin64) { 6701 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); 6702 if (GPRSaveSize & 15) 6703 // The extra size here, if triggered, will always be 8. 
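// Worked example (assuming the full x0-x7 set, i.e. not Arm64EC): with five
// fixed GPR arguments, three variadic registers remain, so GPRSaveSize is
// 3 * 8 = 24 bytes and the object created below adds the 8 bytes of padding
// needed to keep the Win64 register save area 16-byte aligned.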
6704 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); 6705 } else 6706 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); 6707 6708 SDValue FIN; 6709 if (Subtarget->isWindowsArm64EC()) { 6710 // With the Arm64EC ABI, we reserve the save area as usual, but we 6711 // compute its address relative to x4. For a normal AArch64->AArch64 6712 // call, x4 == sp on entry, but calls from an entry thunk can pass in a 6713 // different address. 6714 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); 6715 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 6716 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val, 6717 DAG.getConstant(GPRSaveSize, DL, MVT::i64)); 6718 } else { 6719 FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 6720 } 6721 6722 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 6723 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 6724 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 6725 SDValue Store = 6726 DAG.getStore(Val.getValue(1), DL, Val, FIN, 6727 IsWin64 ? MachinePointerInfo::getFixedStack( 6728 MF, GPRIdx, (i - FirstVariadicGPR) * 8) 6729 : MachinePointerInfo::getStack(MF, i * 8)); 6730 MemOps.push_back(Store); 6731 FIN = 6732 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 6733 } 6734 } 6735 FuncInfo->setVarArgsGPRIndex(GPRIdx); 6736 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 6737 6738 if (Subtarget->hasFPARMv8() && !IsWin64) { 6739 static const MCPhysReg FPRArgRegs[] = { 6740 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 6741 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 6742 static const unsigned NumFPRArgRegs = std::size(FPRArgRegs); 6743 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 6744 6745 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 6746 int FPRIdx = 0; 6747 if (FPRSaveSize != 0) { 6748 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); 6749 6750 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 6751 6752 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 6753 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 6754 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 6755 6756 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, 6757 MachinePointerInfo::getStack(MF, i * 16)); 6758 MemOps.push_back(Store); 6759 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 6760 DAG.getConstant(16, DL, PtrVT)); 6761 } 6762 } 6763 FuncInfo->setVarArgsFPRIndex(FPRIdx); 6764 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 6765 } 6766 6767 if (!MemOps.empty()) { 6768 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 6769 } 6770 } 6771 6772 /// LowerCallResult - Lower the result values of a call into the 6773 /// appropriate copies out of appropriate physical registers. 6774 SDValue AArch64TargetLowering::LowerCallResult( 6775 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 6776 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL, 6777 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 6778 SDValue ThisVal) const { 6779 DenseMap<unsigned, SDValue> CopiedRegs; 6780 // Copy all of the result registers out of their specified physreg. 
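// For instance (illustrative), a call whose i128 result comes back split
// across X0 and X1 produces two RVLocs here; the loop below copies each value
// out of its physreg, and CopiedRegs ensures a register shared by several
// result locations is only copied from once.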
6781 for (unsigned i = 0; i != RVLocs.size(); ++i) { 6782 CCValAssign VA = RVLocs[i]; 6783 6784 // Pass 'this' value directly from the argument to return value, to avoid 6785 // reg unit interference 6786 if (i == 0 && isThisReturn) { 6787 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 6788 "unexpected return calling convention register assignment"); 6789 InVals.push_back(ThisVal); 6790 continue; 6791 } 6792 6793 // Avoid copying a physreg twice since RegAllocFast is incompetent and only 6794 // allows one use of a physreg per block. 6795 SDValue Val = CopiedRegs.lookup(VA.getLocReg()); 6796 if (!Val) { 6797 Val = 6798 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 6799 Chain = Val.getValue(1); 6800 InFlag = Val.getValue(2); 6801 CopiedRegs[VA.getLocReg()] = Val; 6802 } 6803 6804 switch (VA.getLocInfo()) { 6805 default: 6806 llvm_unreachable("Unknown loc info!"); 6807 case CCValAssign::Full: 6808 break; 6809 case CCValAssign::BCvt: 6810 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 6811 break; 6812 case CCValAssign::AExtUpper: 6813 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, 6814 DAG.getConstant(32, DL, VA.getLocVT())); 6815 [[fallthrough]]; 6816 case CCValAssign::AExt: 6817 [[fallthrough]]; 6818 case CCValAssign::ZExt: 6819 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); 6820 break; 6821 } 6822 6823 InVals.push_back(Val); 6824 } 6825 6826 return Chain; 6827 } 6828 6829 /// Return true if the calling convention is one that we can guarantee TCO for. 6830 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { 6831 return (CC == CallingConv::Fast && GuaranteeTailCalls) || 6832 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 6833 } 6834 6835 /// Return true if we might ever do TCO for calls with this calling convention. 6836 static bool mayTailCallThisCC(CallingConv::ID CC) { 6837 switch (CC) { 6838 case CallingConv::C: 6839 case CallingConv::AArch64_SVE_VectorCall: 6840 case CallingConv::PreserveMost: 6841 case CallingConv::Swift: 6842 case CallingConv::SwiftTail: 6843 case CallingConv::Tail: 6844 case CallingConv::Fast: 6845 return true; 6846 default: 6847 return false; 6848 } 6849 } 6850 6851 static void analyzeCallOperands(const AArch64TargetLowering &TLI, 6852 const AArch64Subtarget *Subtarget, 6853 const TargetLowering::CallLoweringInfo &CLI, 6854 CCState &CCInfo) { 6855 const SelectionDAG &DAG = CLI.DAG; 6856 CallingConv::ID CalleeCC = CLI.CallConv; 6857 bool IsVarArg = CLI.IsVarArg; 6858 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 6859 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 6860 6861 unsigned NumArgs = Outs.size(); 6862 for (unsigned i = 0; i != NumArgs; ++i) { 6863 MVT ArgVT = Outs[i].VT; 6864 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 6865 6866 bool UseVarArgCC = false; 6867 if (IsVarArg) { 6868 // On Windows, the fixed arguments in a vararg call are passed in GPRs 6869 // too, so use the vararg CC to force them to integer registers. 6870 if (IsCalleeWin64) { 6871 UseVarArgCC = true; 6872 } else { 6873 UseVarArgCC = !Outs[i].IsFixed; 6874 } 6875 } 6876 6877 if (!UseVarArgCC) { 6878 // Get type of the original argument. 6879 EVT ActualVT = 6880 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty, 6881 /*AllowUnknown*/ true); 6882 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT; 6883 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
6884 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 6885 ArgVT = MVT::i8; 6886 else if (ActualMVT == MVT::i16) 6887 ArgVT = MVT::i16; 6888 } 6889 6890 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); 6891 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 6892 assert(!Res && "Call operand has unhandled type"); 6893 (void)Res; 6894 } 6895 } 6896 6897 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 6898 const CallLoweringInfo &CLI) const { 6899 CallingConv::ID CalleeCC = CLI.CallConv; 6900 if (!mayTailCallThisCC(CalleeCC)) 6901 return false; 6902 6903 SDValue Callee = CLI.Callee; 6904 bool IsVarArg = CLI.IsVarArg; 6905 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 6906 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 6907 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 6908 const SelectionDAG &DAG = CLI.DAG; 6909 MachineFunction &MF = DAG.getMachineFunction(); 6910 const Function &CallerF = MF.getFunction(); 6911 CallingConv::ID CallerCC = CallerF.getCallingConv(); 6912 6913 // SME Streaming functions are not eligible for TCO as they may require 6914 // the streaming mode or ZA to be restored after returning from the call. 6915 SMEAttrs CallerAttrs(MF.getFunction()); 6916 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); 6917 if (CallerAttrs.requiresSMChange(CalleeAttrs) || 6918 CallerAttrs.requiresLazySave(CalleeAttrs)) 6919 return false; 6920 6921 // Functions using the C or Fast calling convention that have an SVE signature 6922 // preserve more registers and should assume the SVE_VectorCall CC. 6923 // The check for matching callee-saved regs will determine whether it is 6924 // eligible for TCO. 6925 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && 6926 MF.getInfo<AArch64FunctionInfo>()->isSVECC()) 6927 CallerCC = CallingConv::AArch64_SVE_VectorCall; 6928 6929 bool CCMatch = CallerCC == CalleeCC; 6930 6931 // When using the Windows calling convention on a non-windows OS, we want 6932 // to back up and restore X18 in such functions; we can't do a tail call 6933 // from those functions. 6934 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && 6935 CalleeCC != CallingConv::Win64) 6936 return false; 6937 6938 // Byval parameters hand the function a pointer directly into the stack area 6939 // we want to reuse during a tail call. Working around this *is* possible (see 6940 // X86) but less efficient and uglier in LowerCall. 6941 for (Function::const_arg_iterator i = CallerF.arg_begin(), 6942 e = CallerF.arg_end(); 6943 i != e; ++i) { 6944 if (i->hasByValAttr()) 6945 return false; 6946 6947 // On Windows, "inreg" attributes signify non-aggregate indirect returns. 6948 // In this case, it is necessary to save/restore X0 in the callee. Tail 6949 // call opt interferes with this. So we disable tail call opt when the 6950 // caller has an argument with "inreg" attribute. 6951 6952 // FIXME: Check whether the callee also has an "inreg" argument. 6953 if (i->hasInRegAttr()) 6954 return false; 6955 } 6956 6957 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) 6958 return CCMatch; 6959 6960 // Externally-defined functions with weak linkage should not be 6961 // tail-called on AArch64 when the OS does not support dynamic 6962 // pre-emption of symbols, as the AAELF spec requires normal calls 6963 // to undefined weak functions to be replaced with a NOP or jump to the 6964 // next instruction. 
The behaviour of branch instructions in this 6965 // situation (as used for tail calls) is implementation-defined, so we 6966 // cannot rely on the linker replacing the tail call with a return. 6967 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 6968 const GlobalValue *GV = G->getGlobal(); 6969 const Triple &TT = getTargetMachine().getTargetTriple(); 6970 if (GV->hasExternalWeakLinkage() && 6971 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 6972 return false; 6973 } 6974 6975 // Now we search for cases where we can use a tail call without changing the 6976 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 6977 // concept. 6978 6979 // I want anyone implementing a new calling convention to think long and hard 6980 // about this assert. 6981 assert((!IsVarArg || CalleeCC == CallingConv::C) && 6982 "Unexpected variadic calling convention"); 6983 6984 LLVMContext &C = *DAG.getContext(); 6985 // Check that the call results are passed in the same way. 6986 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 6987 CCAssignFnForCall(CalleeCC, IsVarArg), 6988 CCAssignFnForCall(CallerCC, IsVarArg))) 6989 return false; 6990 // The callee has to preserve all registers the caller needs to preserve. 6991 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 6992 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 6993 if (!CCMatch) { 6994 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 6995 if (Subtarget->hasCustomCallingConv()) { 6996 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); 6997 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); 6998 } 6999 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 7000 return false; 7001 } 7002 7003 // Nothing more to check if the callee is taking no arguments. 7004 if (Outs.empty()) 7005 return true; 7006 7007 SmallVector<CCValAssign, 16> ArgLocs; 7008 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C); 7009 7010 analyzeCallOperands(*this, Subtarget, CLI, CCInfo); 7011 7012 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) { 7013 // When the call is musttail, additional checks have already been done, so we 7014 // can safely skip this one. Otherwise, at least two cases apply here: if the 7015 // caller is fastcc then we can't have any memory arguments (we'd be expected 7016 // to clean up the stack afterwards), and if the caller is C then we could potentially use its argument area. 7017 7018 // FIXME: for now we take the most conservative of these in both cases: 7019 // disallow all variadic memory operands. 7020 for (const CCValAssign &ArgLoc : ArgLocs) 7021 if (!ArgLoc.isRegLoc()) 7022 return false; 7023 } 7024 7025 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 7026 7027 // If any of the arguments is passed indirectly, it must be SVE, so 7028 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to 7029 // allocate space on the stack. That is why we explicitly decide here that such 7030 // a call cannot be a tail call. 7031 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) { 7032 assert((A.getLocInfo() != CCValAssign::Indirect || 7033 A.getValVT().isScalableVector() || 7034 Subtarget->isWindowsArm64EC()) && 7035 "Expected value to be scalable"); 7036 return A.getLocInfo() == CCValAssign::Indirect; 7037 })) 7038 return false; 7039 7040 // If the stack arguments for this call do not fit into our own save area then 7041 // the call cannot be made tail.
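// Concrete illustration: if this function itself received 16 bytes of stack
// arguments (BytesInStackArgArea == 16) but the prospective tail call needs
// 32 bytes of outgoing stack arguments, the comparison below fails and the
// call is lowered as a normal call instead.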
7042 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) 7043 return false; 7044 7045 const MachineRegisterInfo &MRI = MF.getRegInfo(); 7046 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 7047 return false; 7048 7049 return true; 7050 } 7051 7052 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 7053 SelectionDAG &DAG, 7054 MachineFrameInfo &MFI, 7055 int ClobberedFI) const { 7056 SmallVector<SDValue, 8> ArgChains; 7057 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); 7058 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; 7059 7060 // Include the original chain at the beginning of the list. When this is 7061 // used by target LowerCall hooks, this helps legalize find the 7062 // CALLSEQ_BEGIN node. 7063 ArgChains.push_back(Chain); 7064 7065 // Add a chain value for each stack argument corresponding 7066 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) 7067 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) 7068 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 7069 if (FI->getIndex() < 0) { 7070 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); 7071 int64_t InLastByte = InFirstByte; 7072 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; 7073 7074 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 7075 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 7076 ArgChains.push_back(SDValue(L, 1)); 7077 } 7078 7079 // Build a tokenfactor for all the chains. 7080 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 7081 } 7082 7083 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 7084 bool TailCallOpt) const { 7085 return (CallCC == CallingConv::Fast && TailCallOpt) || 7086 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; 7087 } 7088 7089 // Check if the value is zero-extended from i1 to i8 7090 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { 7091 unsigned SizeInBits = Arg.getValueType().getSizeInBits(); 7092 if (SizeInBits < 8) 7093 return false; 7094 7095 APInt RequredZero(SizeInBits, 0xFE); 7096 KnownBits Bits = DAG.computeKnownBits(Arg, 4); 7097 bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero; 7098 return ZExtBool; 7099 } 7100 7101 SDValue AArch64TargetLowering::changeStreamingMode( 7102 SelectionDAG &DAG, SDLoc DL, bool Enable, 7103 SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const { 7104 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 7105 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); 7106 SDValue MSROp = 7107 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32); 7108 7109 SDValue ExpectedSMVal = 7110 DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64); 7111 SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask}; 7112 7113 if (InFlag) 7114 Ops.push_back(InFlag); 7115 7116 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP; 7117 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); 7118 } 7119 7120 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 7121 /// and add input and output parameter nodes. 
7122 SDValue 7123 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 7124 SmallVectorImpl<SDValue> &InVals) const { 7125 SelectionDAG &DAG = CLI.DAG; 7126 SDLoc &DL = CLI.DL; 7127 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 7128 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 7129 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 7130 SDValue Chain = CLI.Chain; 7131 SDValue Callee = CLI.Callee; 7132 bool &IsTailCall = CLI.IsTailCall; 7133 CallingConv::ID &CallConv = CLI.CallConv; 7134 bool IsVarArg = CLI.IsVarArg; 7135 7136 MachineFunction &MF = DAG.getMachineFunction(); 7137 MachineFunction::CallSiteInfo CSInfo; 7138 bool IsThisReturn = false; 7139 7140 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 7141 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 7142 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType; 7143 bool IsSibCall = false; 7144 bool GuardWithBTI = false; 7145 7146 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && 7147 !Subtarget->noBTIAtReturnTwice()) { 7148 GuardWithBTI = FuncInfo->branchTargetEnforcement(); 7149 } 7150 7151 // Analyze operands of the call, assigning locations to each operand. 7152 SmallVector<CCValAssign, 16> ArgLocs; 7153 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 7154 7155 if (IsVarArg) { 7156 unsigned NumArgs = Outs.size(); 7157 7158 for (unsigned i = 0; i != NumArgs; ++i) { 7159 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) 7160 report_fatal_error("Passing SVE types to variadic functions is " 7161 "currently not supported"); 7162 } 7163 } 7164 7165 analyzeCallOperands(*this, Subtarget, CLI, CCInfo); 7166 7167 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 7168 // Assign locations to each value returned by this call. 7169 SmallVector<CCValAssign, 16> RVLocs; 7170 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 7171 *DAG.getContext()); 7172 RetCCInfo.AnalyzeCallResult(Ins, RetCC); 7173 7174 // Check callee args/returns for SVE registers and set calling convention 7175 // accordingly. 7176 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { 7177 auto HasSVERegLoc = [](CCValAssign &Loc) { 7178 if (!Loc.isRegLoc()) 7179 return false; 7180 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) || 7181 AArch64::PPRRegClass.contains(Loc.getLocReg()); 7182 }; 7183 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc)) 7184 CallConv = CallingConv::AArch64_SVE_VectorCall; 7185 } 7186 7187 if (IsTailCall) { 7188 // Check if it's really possible to do a tail call. 7189 IsTailCall = isEligibleForTailCallOptimization(CLI); 7190 7191 // A sibling call is one where we're under the usual C ABI and not planning 7192 // to change that but can still do a tail call: 7193 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && 7194 CallConv != CallingConv::SwiftTail) 7195 IsSibCall = true; 7196 7197 if (IsTailCall) 7198 ++NumTailCalls; 7199 } 7200 7201 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) 7202 report_fatal_error("failed to perform tail call elimination on a call " 7203 "site marked musttail"); 7204 7205 // Get a count of how many bytes are to be pushed on the stack. 7206 unsigned NumBytes = CCInfo.getNextStackOffset(); 7207 7208 if (IsSibCall) { 7209 // Since we're not changing the ABI to make this a tail call, the memory 7210 // operands are already available in the caller's incoming argument space. 
7211 NumBytes = 0; 7212 } 7213 7214 // FPDiff is the byte offset of the call's argument area from the callee's. 7215 // Stores to callee stack arguments will be placed in FixedStackSlots offset 7216 // by this amount for a tail call. In a sibling call it must be 0 because the 7217 // caller will deallocate the entire stack and the callee still expects its 7218 // arguments to begin at SP+0. Completely unused for non-tail calls. 7219 int FPDiff = 0; 7220 7221 if (IsTailCall && !IsSibCall) { 7222 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 7223 7224 // Since callee will pop argument stack as a tail call, we must keep the 7225 // popped size 16-byte aligned. 7226 NumBytes = alignTo(NumBytes, 16); 7227 7228 // FPDiff will be negative if this tail call requires more space than we 7229 // would automatically have in our incoming argument space. Positive if we 7230 // can actually shrink the stack. 7231 FPDiff = NumReusableBytes - NumBytes; 7232 7233 // Update the required reserved area if this is the tail call requiring the 7234 // most argument stack space. 7235 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) 7236 FuncInfo->setTailCallReservedStack(-FPDiff); 7237 7238 // The stack pointer must be 16-byte aligned at all times it's used for a 7239 // memory operation, which in practice means at *all* times and in 7240 // particular across call boundaries. Therefore our own arguments started at 7241 // a 16-byte aligned SP and the delta applied for the tail call should 7242 // satisfy the same constraint. 7243 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 7244 } 7245 7246 // Determine whether we need any streaming mode changes. 7247 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); 7248 if (CLI.CB) 7249 CalleeAttrs = SMEAttrs(*CLI.CB); 7250 else if (std::optional<SMEAttrs> Attrs = 7251 getCalleeAttrsFromExternalFunction(CLI.Callee)) 7252 CalleeAttrs = *Attrs; 7253 7254 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); 7255 7256 MachineFrameInfo &MFI = MF.getFrameInfo(); 7257 if (RequiresLazySave) { 7258 // Set up a lazy save mechanism by storing the runtime live slices 7259 // (worst-case N*N) to the TPIDR2 stack object. 7260 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, 7261 DAG.getConstant(1, DL, MVT::i32)); 7262 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); 7263 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); 7264 7265 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); 7266 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, 7267 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 7268 SDValue BufferPtrAddr = 7269 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, 7270 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); 7271 Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); 7272 Chain = DAG.getNode( 7273 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, 7274 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), 7275 TPIDR2ObjAddr); 7276 } 7277 7278 SDValue PStateSM; 7279 std::optional<bool> RequiresSMChange = 7280 CallerAttrs.requiresSMChange(CalleeAttrs); 7281 if (RequiresSMChange) 7282 PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); 7283 7284 // Adjust the stack pointer for the new arguments... 7285 // These operations are automatically eliminated by the prolog/epilog pass 7286 if (!IsSibCall) 7287 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 
0 : NumBytes, 0, DL); 7288 7289 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 7290 getPointerTy(DAG.getDataLayout())); 7291 7292 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 7293 SmallSet<unsigned, 8> RegsUsed; 7294 SmallVector<SDValue, 8> MemOpChains; 7295 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7296 7297 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { 7298 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 7299 for (const auto &F : Forwards) { 7300 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); 7301 RegsToPass.emplace_back(F.PReg, Val); 7302 } 7303 } 7304 7305 // Walk the register/memloc assignments, inserting copies/loads. 7306 unsigned ExtraArgLocs = 0; 7307 for (unsigned i = 0, e = Outs.size(); i != e; ++i) { 7308 CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; 7309 SDValue Arg = OutVals[i]; 7310 ISD::ArgFlagsTy Flags = Outs[i].Flags; 7311 7312 // Promote the value if needed. 7313 switch (VA.getLocInfo()) { 7314 default: 7315 llvm_unreachable("Unknown loc info!"); 7316 case CCValAssign::Full: 7317 break; 7318 case CCValAssign::SExt: 7319 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 7320 break; 7321 case CCValAssign::ZExt: 7322 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 7323 break; 7324 case CCValAssign::AExt: 7325 if (Outs[i].ArgVT == MVT::i1) { 7326 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 7327 // 7328 // Check if we actually have to do this, because the value may 7329 // already be zero-extended. 7330 // 7331 // We cannot just emit a (zext i8 (trunc (assert-zext i8))) 7332 // and rely on DAGCombiner to fold this, because the following 7333 // (anyext i32) is combined with (zext i8) in DAG.getNode: 7334 // 7335 // (ext (zext x)) -> (zext x) 7336 // 7337 // This will give us (zext i32), which we cannot remove, so 7338 // try to check this beforehand. 
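        //
        // For example, if Arg is already the result of a SETCC or an
        // (and X, 1), computeKnownBits can prove that bits [7:1] are zero and
        // checkZExtBool lets us skip the extra TRUNCATE/ZERO_EXTEND pair.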
7339 if (!checkZExtBool(Arg, DAG)) { 7340 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 7341 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 7342 } 7343 } 7344 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 7345 break; 7346 case CCValAssign::AExtUpper: 7347 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 7348 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 7349 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 7350 DAG.getConstant(32, DL, VA.getLocVT())); 7351 break; 7352 case CCValAssign::BCvt: 7353 Arg = DAG.getBitcast(VA.getLocVT(), Arg); 7354 break; 7355 case CCValAssign::Trunc: 7356 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 7357 break; 7358 case CCValAssign::FPExt: 7359 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 7360 break; 7361 case CCValAssign::Indirect: 7362 bool isScalable = VA.getValVT().isScalableVector(); 7363 assert((isScalable || Subtarget->isWindowsArm64EC()) && 7364 "Indirect arguments should be scalable on most subtargets"); 7365 7366 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); 7367 uint64_t PartSize = StoreSize; 7368 unsigned NumParts = 1; 7369 if (Outs[i].Flags.isInConsecutiveRegs()) { 7370 assert(!Outs[i].Flags.isInConsecutiveRegsLast()); 7371 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) 7372 ++NumParts; 7373 StoreSize *= NumParts; 7374 } 7375 7376 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); 7377 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); 7378 int FI = MFI.CreateStackObject(StoreSize, Alignment, false); 7379 if (isScalable) 7380 MFI.setStackID(FI, TargetStackID::ScalableVector); 7381 7382 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); 7383 SDValue Ptr = DAG.getFrameIndex( 7384 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 7385 SDValue SpillSlot = Ptr; 7386 7387 // Ensure we generate all stores for each tuple part, whilst updating the 7388 // pointer after each store correctly using vscale. 7389 while (NumParts) { 7390 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); 7391 NumParts--; 7392 if (NumParts > 0) { 7393 SDValue BytesIncrement; 7394 if (isScalable) { 7395 BytesIncrement = DAG.getVScale( 7396 DL, Ptr.getValueType(), 7397 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); 7398 } else { 7399 BytesIncrement = DAG.getConstant( 7400 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, 7401 Ptr.getValueType()); 7402 } 7403 SDNodeFlags Flags; 7404 Flags.setNoUnsignedWrap(true); 7405 7406 MPI = MachinePointerInfo(MPI.getAddrSpace()); 7407 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 7408 BytesIncrement, Flags); 7409 ExtraArgLocs++; 7410 i++; 7411 } 7412 } 7413 7414 Arg = SpillSlot; 7415 break; 7416 } 7417 7418 if (VA.isRegLoc()) { 7419 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 7420 Outs[0].VT == MVT::i64) { 7421 assert(VA.getLocVT() == MVT::i64 && 7422 "unexpected calling convention register assignment"); 7423 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 7424 "unexpected use of 'returned'"); 7425 IsThisReturn = true; 7426 } 7427 if (RegsUsed.count(VA.getLocReg())) { 7428 // If this register has already been used then we're trying to pack 7429 // parts of an [N x i32] into an X-register. The extension type will 7430 // take care of putting the two halves in the right place but we have to 7431 // combine them. 
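        // For example, if both i32 halves of a [2 x i32] aggregate land in the
        // same X register, the first half occupies bits [31:0] and the second
        // (shifted by AExtUpper above) bits [63:32]; the OR below merges them.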
7432 SDValue &Bits = 7433 llvm::find_if(RegsToPass, 7434 [=](const std::pair<unsigned, SDValue> &Elt) { 7435 return Elt.first == VA.getLocReg(); 7436 }) 7437 ->second; 7438 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 7439 // Call site info is used for function's parameter entry value 7440 // tracking. For now we track only simple cases when parameter 7441 // is transferred through whole register. 7442 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { 7443 return ArgReg.Reg == VA.getLocReg(); 7444 }); 7445 } else { 7446 // Add an extra level of indirection for streaming mode changes by 7447 // using a pseudo copy node that cannot be rematerialised between a 7448 // smstart/smstop and the call by the simple register coalescer. 7449 if (RequiresSMChange && isa<FrameIndexSDNode>(Arg)) 7450 Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg); 7451 RegsToPass.emplace_back(VA.getLocReg(), Arg); 7452 RegsUsed.insert(VA.getLocReg()); 7453 const TargetOptions &Options = DAG.getTarget().Options; 7454 if (Options.EmitCallSiteInfo) 7455 CSInfo.emplace_back(VA.getLocReg(), i); 7456 } 7457 } else { 7458 assert(VA.isMemLoc()); 7459 7460 SDValue DstAddr; 7461 MachinePointerInfo DstInfo; 7462 7463 // FIXME: This works on big-endian for composite byvals, which are the 7464 // common case. It should also work for fundamental types too. 7465 uint32_t BEAlign = 0; 7466 unsigned OpSize; 7467 if (VA.getLocInfo() == CCValAssign::Indirect || 7468 VA.getLocInfo() == CCValAssign::Trunc) 7469 OpSize = VA.getLocVT().getFixedSizeInBits(); 7470 else 7471 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 7472 : VA.getValVT().getSizeInBits(); 7473 OpSize = (OpSize + 7) / 8; 7474 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 7475 !Flags.isInConsecutiveRegs()) { 7476 if (OpSize < 8) 7477 BEAlign = 8 - OpSize; 7478 } 7479 unsigned LocMemOffset = VA.getLocMemOffset(); 7480 int32_t Offset = LocMemOffset + BEAlign; 7481 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 7482 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 7483 7484 if (IsTailCall) { 7485 Offset = Offset + FPDiff; 7486 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 7487 7488 DstAddr = DAG.getFrameIndex(FI, PtrVT); 7489 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 7490 7491 // Make sure any stack arguments overlapping with where we're storing 7492 // are loaded before this eventual operation. Otherwise they'll be 7493 // clobbered. 7494 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 7495 } else { 7496 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 7497 7498 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 7499 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); 7500 } 7501 7502 if (Outs[i].Flags.isByVal()) { 7503 SDValue SizeNode = 7504 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 7505 SDValue Cpy = DAG.getMemcpy( 7506 Chain, DL, DstAddr, Arg, SizeNode, 7507 Outs[i].Flags.getNonZeroByValAlign(), 7508 /*isVol = */ false, /*AlwaysInline = */ false, 7509 /*isTailCall = */ false, DstInfo, MachinePointerInfo()); 7510 7511 MemOpChains.push_back(Cpy); 7512 } else { 7513 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 7514 // promoted to a legal register type i32, we should truncate Arg back to 7515 // i1/i8/i16. 
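          // For example, an i8 argument that was promoted to i32 is truncated
          // back to i8 here so the store below writes exactly one byte at its
          // AAPCS-assigned stack slot.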
7516 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 7517 VA.getValVT() == MVT::i16) 7518 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 7519 7520 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 7521 MemOpChains.push_back(Store); 7522 } 7523 } 7524 } 7525 7526 if (IsVarArg && Subtarget->isWindowsArm64EC()) { 7527 // For vararg calls, the Arm64EC ABI requires values in x4 and x5 7528 // describing the argument list. x4 contains the address of the 7529 // first stack parameter. x5 contains the size in bytes of all parameters 7530 // passed on the stack. 7531 RegsToPass.emplace_back(AArch64::X4, StackPtr); 7532 RegsToPass.emplace_back(AArch64::X5, 7533 DAG.getConstant(NumBytes, DL, MVT::i64)); 7534 } 7535 7536 if (!MemOpChains.empty()) 7537 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 7538 7539 SDValue InFlag; 7540 if (RequiresSMChange) { 7541 SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, 7542 InFlag, PStateSM, true); 7543 Chain = NewChain.getValue(0); 7544 InFlag = NewChain.getValue(1); 7545 } 7546 7547 // Build a sequence of copy-to-reg nodes chained together with token chain 7548 // and flag operands which copy the outgoing args into the appropriate regs. 7549 for (auto &RegToPass : RegsToPass) { 7550 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 7551 RegToPass.second, InFlag); 7552 InFlag = Chain.getValue(1); 7553 } 7554 7555 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 7556 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 7557 // node so that legalize doesn't hack it. 7558 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 7559 auto GV = G->getGlobal(); 7560 unsigned OpFlags = 7561 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); 7562 if (OpFlags & AArch64II::MO_GOT) { 7563 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); 7564 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 7565 } else { 7566 const GlobalValue *GV = G->getGlobal(); 7567 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 7568 } 7569 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 7570 if (getTargetMachine().getCodeModel() == CodeModel::Large && 7571 Subtarget->isTargetMachO()) { 7572 const char *Sym = S->getSymbol(); 7573 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 7574 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 7575 } else { 7576 const char *Sym = S->getSymbol(); 7577 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 7578 } 7579 } 7580 7581 // We don't usually want to end the call-sequence here because we would tidy 7582 // the frame up *after* the call, however in the ABI-changing tail-call case 7583 // we've carefully laid out the parameters so that when sp is reset they'll be 7584 // in the correct location. 7585 if (IsTailCall && !IsSibCall) { 7586 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL); 7587 InFlag = Chain.getValue(1); 7588 } 7589 7590 std::vector<SDValue> Ops; 7591 Ops.push_back(Chain); 7592 Ops.push_back(Callee); 7593 7594 if (IsTailCall) { 7595 // Each tail call may have to adjust the stack by a different amount, so 7596 // this information must travel along with the operation for eventual 7597 // consumption by emitEpilogue. 7598 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); 7599 } 7600 7601 // Add argument registers to the end of the list so that they are known live 7602 // into the call. 
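  // For example, a call taking (i64, double) will typically list X0 and D0
  // here (per AAPCS64 register assignment), keeping them live across the call
  // node for the register allocator.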
  for (auto &RegToPass : RegsToPass)
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask;
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (IsThisReturn) {
    // For 'this' returns, use the X0-preserving mask if applicable.
    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
    if (!Mask) {
      IsThisReturn = false;
      Mask = TRI->getCallPreservedMask(MF, CallConv);
    }
  } else
    Mask = TRI->getCallPreservedMask(MF, CallConv);

  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(MF, &Mask);

  if (TRI->isAnyArgRegReserved(MF))
    TRI->emitReservedArgRegCallError(MF);

  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MF.getFrameInfo().setHasTailCall();
    SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);

    if (IsCFICall)
      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());

    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
    return Ret;
  }

  unsigned CallOpc = AArch64ISD::CALL;
  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
  // be expanded to the call, directly followed by a special marker sequence and
  // a call to an ObjC library function. Use CALL_RVMARKER to do that.
  if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
    assert(!IsTailCall &&
           "tail calls cannot be marked with clang.arc.attachedcall");
    CallOpc = AArch64ISD::CALL_RVMARKER;

    // Add a target global address for the retainRV/claimRV runtime function
    // just before the call target.
    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
    auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
    Ops.insert(Ops.begin() + 1, GA);
  } else if (GuardWithBTI)
    CallOpc = AArch64ISD::CALL_BTI;

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);

  if (IsCFICall)
    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());

  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  InFlag = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  uint64_t CalleePopBytes =
      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
                                   DL, DAG, InVals, IsThisReturn,
                                   IsThisReturn ? OutVals[0] : SDValue());

  if (!Ins.empty())
    InFlag = Result.getValue(Result->getNumValues() - 1);

  if (RequiresSMChange) {
    assert(PStateSM && "Expected a PStateSM to be set");
    Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
                                 PStateSM, false);
  }

  if (RequiresLazySave) {
    // Unconditionally resume ZA.
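    //
    // The sequence emitted below corresponds roughly to the SME ABI lazy-save
    // epilogue (an illustrative sketch; the exact code depends on the frame
    // layout):
    //   smstart za
    //   mrs  x8, TPIDR2_EL0
    //   add  x0, <frame>, #<tpidr2-block-offset>
    //   cbnz x8, 1f              // restore only if the callee committed the save
    //   bl   __arm_tpidr2_restore
    // 1:
    //   msr  TPIDR2_EL0, xzr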
7697 Result = DAG.getNode( 7698 AArch64ISD::SMSTART, DL, MVT::Other, Result, 7699 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), 7700 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); 7701 7702 // Conditionally restore the lazy save using a pseudo node. 7703 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); 7704 SDValue RegMask = DAG.getRegisterMask( 7705 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); 7706 SDValue RestoreRoutine = DAG.getTargetExternalSymbol( 7707 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); 7708 SDValue TPIDR2_EL0 = DAG.getNode( 7709 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, 7710 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); 7711 7712 // Copy the address of the TPIDR2 block into X0 before 'calling' the 7713 // RESTORE_ZA pseudo. 7714 SDValue Glue; 7715 SDValue TPIDR2Block = DAG.getFrameIndex( 7716 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 7717 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); 7718 Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, 7719 {Result, TPIDR2_EL0, 7720 DAG.getRegister(AArch64::X0, MVT::i64), 7721 RestoreRoutine, 7722 RegMask, 7723 Result.getValue(1)}); 7724 7725 // Finally reset the TPIDR2_EL0 register to 0. 7726 Result = DAG.getNode( 7727 ISD::INTRINSIC_VOID, DL, MVT::Other, Result, 7728 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), 7729 DAG.getConstant(0, DL, MVT::i64)); 7730 } 7731 7732 if (RequiresSMChange || RequiresLazySave) { 7733 for (unsigned I = 0; I < InVals.size(); ++I) { 7734 // The smstart/smstop is chained as part of the call, but when the 7735 // resulting chain is discarded (which happens when the call is not part 7736 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the 7737 // smstart/smstop is chained to the result value. We can do that by doing 7738 // a vreg -> vreg copy. 7739 Register Reg = MF.getRegInfo().createVirtualRegister( 7740 getRegClassFor(InVals[I].getValueType().getSimpleVT())); 7741 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]); 7742 InVals[I] = DAG.getCopyFromReg(X, DL, Reg, 7743 InVals[I].getValueType()); 7744 } 7745 } 7746 7747 return Result; 7748 } 7749 7750 bool AArch64TargetLowering::CanLowerReturn( 7751 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 7752 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 7753 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 7754 SmallVector<CCValAssign, 16> RVLocs; 7755 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 7756 return CCInfo.CheckReturn(Outs, RetCC); 7757 } 7758 7759 SDValue 7760 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 7761 bool isVarArg, 7762 const SmallVectorImpl<ISD::OutputArg> &Outs, 7763 const SmallVectorImpl<SDValue> &OutVals, 7764 const SDLoc &DL, SelectionDAG &DAG) const { 7765 auto &MF = DAG.getMachineFunction(); 7766 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 7767 7768 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 7769 SmallVector<CCValAssign, 16> RVLocs; 7770 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); 7771 CCInfo.AnalyzeReturn(Outs, RetCC); 7772 7773 // Copy the result values into the output registers. 
7774 SDValue Flag; 7775 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; 7776 SmallSet<unsigned, 4> RegsUsed; 7777 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 7778 ++i, ++realRVLocIdx) { 7779 CCValAssign &VA = RVLocs[i]; 7780 assert(VA.isRegLoc() && "Can only return in registers!"); 7781 SDValue Arg = OutVals[realRVLocIdx]; 7782 7783 switch (VA.getLocInfo()) { 7784 default: 7785 llvm_unreachable("Unknown loc info!"); 7786 case CCValAssign::Full: 7787 if (Outs[i].ArgVT == MVT::i1) { 7788 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 7789 // value. This is strictly redundant on Darwin (which uses "zeroext 7790 // i1"), but will be optimised out before ISel. 7791 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 7792 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 7793 } 7794 break; 7795 case CCValAssign::BCvt: 7796 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 7797 break; 7798 case CCValAssign::AExt: 7799 case CCValAssign::ZExt: 7800 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 7801 break; 7802 case CCValAssign::AExtUpper: 7803 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 7804 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 7805 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 7806 DAG.getConstant(32, DL, VA.getLocVT())); 7807 break; 7808 } 7809 7810 if (RegsUsed.count(VA.getLocReg())) { 7811 SDValue &Bits = 7812 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { 7813 return Elt.first == VA.getLocReg(); 7814 })->second; 7815 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 7816 } else { 7817 RetVals.emplace_back(VA.getLocReg(), Arg); 7818 RegsUsed.insert(VA.getLocReg()); 7819 } 7820 } 7821 7822 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 7823 7824 // Emit SMSTOP before returning from a locally streaming function 7825 SMEAttrs FuncAttrs(MF.getFunction()); 7826 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { 7827 Chain = DAG.getNode( 7828 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, 7829 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), 7830 DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), 7831 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); 7832 Flag = Chain.getValue(1); 7833 } 7834 7835 SmallVector<SDValue, 4> RetOps(1, Chain); 7836 for (auto &RetVal : RetVals) { 7837 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); 7838 Flag = Chain.getValue(1); 7839 RetOps.push_back( 7840 DAG.getRegister(RetVal.first, RetVal.second.getValueType())); 7841 } 7842 7843 // Windows AArch64 ABIs require that for returning structs by value we copy 7844 // the sret argument into X0 for the return. 7845 // We saved the argument into a virtual register in the entry block, 7846 // so now we copy the value out and into X0. 
7847 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 7848 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, 7849 getPointerTy(MF.getDataLayout())); 7850 7851 unsigned RetValReg = AArch64::X0; 7852 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); 7853 Flag = Chain.getValue(1); 7854 7855 RetOps.push_back( 7856 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 7857 } 7858 7859 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF); 7860 if (I) { 7861 for (; *I; ++I) { 7862 if (AArch64::GPR64RegClass.contains(*I)) 7863 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 7864 else if (AArch64::FPR64RegClass.contains(*I)) 7865 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 7866 else 7867 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 7868 } 7869 } 7870 7871 RetOps[0] = Chain; // Update chain. 7872 7873 // Add the flag if we have it. 7874 if (Flag.getNode()) 7875 RetOps.push_back(Flag); 7876 7877 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); 7878 } 7879 7880 //===----------------------------------------------------------------------===// 7881 // Other Lowering Code 7882 //===----------------------------------------------------------------------===// 7883 7884 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, 7885 SelectionDAG &DAG, 7886 unsigned Flag) const { 7887 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 7888 N->getOffset(), Flag); 7889 } 7890 7891 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, 7892 SelectionDAG &DAG, 7893 unsigned Flag) const { 7894 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); 7895 } 7896 7897 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, 7898 SelectionDAG &DAG, 7899 unsigned Flag) const { 7900 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), 7901 N->getOffset(), Flag); 7902 } 7903 7904 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, 7905 SelectionDAG &DAG, 7906 unsigned Flag) const { 7907 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); 7908 } 7909 7910 // (loadGOT sym) 7911 template <class NodeTy> 7912 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, 7913 unsigned Flags) const { 7914 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); 7915 SDLoc DL(N); 7916 EVT Ty = getPointerTy(DAG.getDataLayout()); 7917 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); 7918 // FIXME: Once remat is capable of dealing with instructions with register 7919 // operands, expand this into two nodes instead of using a wrapper node. 
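  //
  // For example, on ELF this typically expands to:
  //   adrp x0, :got:sym
  //   ldr  x0, [x0, :got_lo12:sym]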
7920 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); 7921 } 7922 7923 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) 7924 template <class NodeTy> 7925 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, 7926 unsigned Flags) const { 7927 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); 7928 SDLoc DL(N); 7929 EVT Ty = getPointerTy(DAG.getDataLayout()); 7930 const unsigned char MO_NC = AArch64II::MO_NC; 7931 return DAG.getNode( 7932 AArch64ISD::WrapperLarge, DL, Ty, 7933 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), 7934 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), 7935 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), 7936 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); 7937 } 7938 7939 // (addlow (adrp %hi(sym)) %lo(sym)) 7940 template <class NodeTy> 7941 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, 7942 unsigned Flags) const { 7943 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); 7944 SDLoc DL(N); 7945 EVT Ty = getPointerTy(DAG.getDataLayout()); 7946 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); 7947 SDValue Lo = getTargetNode(N, Ty, DAG, 7948 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); 7949 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); 7950 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); 7951 } 7952 7953 // (adr sym) 7954 template <class NodeTy> 7955 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, 7956 unsigned Flags) const { 7957 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); 7958 SDLoc DL(N); 7959 EVT Ty = getPointerTy(DAG.getDataLayout()); 7960 SDValue Sym = getTargetNode(N, Ty, DAG, Flags); 7961 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); 7962 } 7963 7964 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, 7965 SelectionDAG &DAG) const { 7966 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 7967 const GlobalValue *GV = GN->getGlobal(); 7968 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7969 7970 if (OpFlags != AArch64II::MO_NO_FLAG) 7971 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && 7972 "unexpected offset in global node"); 7973 7974 // This also catches the large code model case for Darwin, and tiny code 7975 // model with got relocations. 7976 if ((OpFlags & AArch64II::MO_GOT) != 0) { 7977 return getGOT(GN, DAG, OpFlags); 7978 } 7979 7980 SDValue Result; 7981 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 7982 Result = getAddrLarge(GN, DAG, OpFlags); 7983 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 7984 Result = getAddrTiny(GN, DAG, OpFlags); 7985 } else { 7986 Result = getAddr(GN, DAG, OpFlags); 7987 } 7988 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7989 SDLoc DL(GN); 7990 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX | 7991 AArch64II::MO_COFFSTUB)) 7992 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 7993 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 7994 return Result; 7995 } 7996 7997 /// Convert a TLS address reference into the correct sequence of loads 7998 /// and calls to compute the variable's address (for Darwin, currently) and 7999 /// return an SDValue containing the final node. 8000 8001 /// Darwin only has one TLS scheme which must be capable of dealing with the 8002 /// fully general situation, in the worst case. This means: 8003 /// + "extern __thread" declaration. 
8004 /// + Defined in a possibly unknown dynamic library. 8005 /// 8006 /// The general system is that each __thread variable has a [3 x i64] descriptor 8007 /// which contains information used by the runtime to calculate the address. The 8008 /// only part of this the compiler needs to know about is the first xword, which 8009 /// contains a function pointer that must be called with the address of the 8010 /// entire descriptor in "x0". 8011 /// 8012 /// Since this descriptor may be in a different unit, in general even the 8013 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 8014 /// is: 8015 /// adrp x0, _var@TLVPPAGE 8016 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 8017 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 8018 /// ; the function pointer 8019 /// blr x1 ; Uses descriptor address in x0 8020 /// ; Address of _var is now in x0. 8021 /// 8022 /// If the address of _var's descriptor *is* known to the linker, then it can 8023 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 8024 /// a slight efficiency gain. 8025 SDValue 8026 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 8027 SelectionDAG &DAG) const { 8028 assert(Subtarget->isTargetDarwin() && 8029 "This function expects a Darwin target"); 8030 8031 SDLoc DL(Op); 8032 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 8033 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 8034 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 8035 8036 SDValue TLVPAddr = 8037 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 8038 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 8039 8040 // The first entry in the descriptor is a function pointer that we must call 8041 // to obtain the address of the variable. 8042 SDValue Chain = DAG.getEntryNode(); 8043 SDValue FuncTLVGet = DAG.getLoad( 8044 PtrMemVT, DL, Chain, DescAddr, 8045 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 8046 Align(PtrMemVT.getSizeInBits() / 8), 8047 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); 8048 Chain = FuncTLVGet.getValue(1); 8049 8050 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. 8051 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); 8052 8053 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8054 MFI.setAdjustsStack(true); 8055 8056 // TLS calls preserve all registers except those that absolutely must be 8057 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 8058 // silly). 8059 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 8060 const uint32_t *Mask = TRI->getTLSCallPreservedMask(); 8061 if (Subtarget->hasCustomCallingConv()) 8062 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 8063 8064 // Finally, we can make the call. This is just a degenerate version of a 8065 // normal AArch64 call node: x0 takes the address of the descriptor, and 8066 // returns the address of the variable in this thread. 
8067 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 8068 Chain = 8069 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 8070 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 8071 DAG.getRegisterMask(Mask), Chain.getValue(1)); 8072 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 8073 } 8074 8075 /// Convert a thread-local variable reference into a sequence of instructions to 8076 /// compute the variable's address for the local exec TLS model of ELF targets. 8077 /// The sequence depends on the maximum TLS area size. 8078 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, 8079 SDValue ThreadBase, 8080 const SDLoc &DL, 8081 SelectionDAG &DAG) const { 8082 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8083 SDValue TPOff, Addr; 8084 8085 switch (DAG.getTarget().Options.TLSSize) { 8086 default: 8087 llvm_unreachable("Unexpected TLS size"); 8088 8089 case 12: { 8090 // mrs x0, TPIDR_EL0 8091 // add x0, x0, :tprel_lo12:a 8092 SDValue Var = DAG.getTargetGlobalAddress( 8093 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); 8094 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 8095 Var, 8096 DAG.getTargetConstant(0, DL, MVT::i32)), 8097 0); 8098 } 8099 8100 case 24: { 8101 // mrs x0, TPIDR_EL0 8102 // add x0, x0, :tprel_hi12:a 8103 // add x0, x0, :tprel_lo12_nc:a 8104 SDValue HiVar = DAG.getTargetGlobalAddress( 8105 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 8106 SDValue LoVar = DAG.getTargetGlobalAddress( 8107 GV, DL, PtrVT, 0, 8108 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 8109 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 8110 HiVar, 8111 DAG.getTargetConstant(0, DL, MVT::i32)), 8112 0); 8113 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, 8114 LoVar, 8115 DAG.getTargetConstant(0, DL, MVT::i32)), 8116 0); 8117 } 8118 8119 case 32: { 8120 // mrs x1, TPIDR_EL0 8121 // movz x0, #:tprel_g1:a 8122 // movk x0, #:tprel_g0_nc:a 8123 // add x0, x1, x0 8124 SDValue HiVar = DAG.getTargetGlobalAddress( 8125 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); 8126 SDValue LoVar = DAG.getTargetGlobalAddress( 8127 GV, DL, PtrVT, 0, 8128 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 8129 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 8130 DAG.getTargetConstant(16, DL, MVT::i32)), 8131 0); 8132 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 8133 DAG.getTargetConstant(0, DL, MVT::i32)), 8134 0); 8135 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 8136 } 8137 8138 case 48: { 8139 // mrs x1, TPIDR_EL0 8140 // movz x0, #:tprel_g2:a 8141 // movk x0, #:tprel_g1_nc:a 8142 // movk x0, #:tprel_g0_nc:a 8143 // add x0, x1, x0 8144 SDValue HiVar = DAG.getTargetGlobalAddress( 8145 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); 8146 SDValue MiVar = DAG.getTargetGlobalAddress( 8147 GV, DL, PtrVT, 0, 8148 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); 8149 SDValue LoVar = DAG.getTargetGlobalAddress( 8150 GV, DL, PtrVT, 0, 8151 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 8152 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 8153 DAG.getTargetConstant(32, DL, MVT::i32)), 8154 0); 8155 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, 8156 DAG.getTargetConstant(16, DL, MVT::i32)), 8157 0); 8158 TPOff = 
SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 8159 DAG.getTargetConstant(0, DL, MVT::i32)), 8160 0); 8161 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 8162 } 8163 } 8164 } 8165 8166 /// When accessing thread-local variables under either the general-dynamic or 8167 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 8168 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 8169 /// is a function pointer to carry out the resolution. 8170 /// 8171 /// The sequence is: 8172 /// adrp x0, :tlsdesc:var 8173 /// ldr x1, [x0, #:tlsdesc_lo12:var] 8174 /// add x0, x0, #:tlsdesc_lo12:var 8175 /// .tlsdesccall var 8176 /// blr x1 8177 /// (TPIDR_EL0 offset now in x0) 8178 /// 8179 /// The above sequence must be produced unscheduled, to enable the linker to 8180 /// optimize/relax this sequence. 8181 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 8182 /// above sequence, and expanded really late in the compilation flow, to ensure 8183 /// the sequence is produced as per above. 8184 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 8185 const SDLoc &DL, 8186 SelectionDAG &DAG) const { 8187 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8188 8189 SDValue Chain = DAG.getEntryNode(); 8190 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8191 8192 Chain = 8193 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 8194 SDValue Glue = Chain.getValue(1); 8195 8196 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 8197 } 8198 8199 SDValue 8200 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 8201 SelectionDAG &DAG) const { 8202 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 8203 8204 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 8205 8206 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 8207 8208 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 8209 if (Model == TLSModel::LocalDynamic) 8210 Model = TLSModel::GeneralDynamic; 8211 } 8212 8213 if (getTargetMachine().getCodeModel() == CodeModel::Large && 8214 Model != TLSModel::LocalExec) 8215 report_fatal_error("ELF TLS only supported in small memory model or " 8216 "in local exec TLS model"); 8217 // Different choices can be made for the maximum size of the TLS area for a 8218 // module. For the small address model, the default TLS size is 16MiB and the 8219 // maximum TLS size is 4GiB. 8220 // FIXME: add tiny and large code model support for TLS access models other 8221 // than local exec. We currently generate the same code as small for tiny, 8222 // which may be larger than needed. 8223 8224 SDValue TPOff; 8225 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8226 SDLoc DL(Op); 8227 const GlobalValue *GV = GA->getGlobal(); 8228 8229 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 8230 8231 if (Model == TLSModel::LocalExec) { 8232 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); 8233 } else if (Model == TLSModel::InitialExec) { 8234 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 8235 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 8236 } else if (Model == TLSModel::LocalDynamic) { 8237 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 8238 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 8239 // the beginning of the module's TLS region, followed by a DTPREL offset 8240 // calculation. 
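    //
    // The emitted sequence is roughly (a sketch; relocation operators are
    // spelled as in the assembler):
    //   adrp x0, :tlsdesc:_TLS_MODULE_BASE_
    //   ldr  x1, [x0, #:tlsdesc_lo12:_TLS_MODULE_BASE_]
    //   add  x0, x0, #:tlsdesc_lo12:_TLS_MODULE_BASE_
    //   .tlsdesccall _TLS_MODULE_BASE_
    //   blr  x1
    //   add  x0, x0, :dtprel_hi12:var, lsl #12
    //   add  x0, x0, :dtprel_lo12_nc:var
    //   mrs  x8, TPIDR_EL0
    //   add  x0, x8, x0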
8241 8242 // These accesses will need deduplicating if there's more than one. 8243 AArch64FunctionInfo *MFI = 8244 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 8245 MFI->incNumLocalDynamicTLSAccesses(); 8246 8247 // The call needs a relocation too for linker relaxation. It doesn't make 8248 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 8249 // the address. 8250 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 8251 AArch64II::MO_TLS); 8252 8253 // Now we can calculate the offset from TPIDR_EL0 to this module's 8254 // thread-local area. 8255 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 8256 8257 // Now use :dtprel_whatever: operations to calculate this variable's offset 8258 // in its thread-storage area. 8259 SDValue HiVar = DAG.getTargetGlobalAddress( 8260 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 8261 SDValue LoVar = DAG.getTargetGlobalAddress( 8262 GV, DL, MVT::i64, 0, 8263 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 8264 8265 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 8266 DAG.getTargetConstant(0, DL, MVT::i32)), 8267 0); 8268 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 8269 DAG.getTargetConstant(0, DL, MVT::i32)), 8270 0); 8271 } else if (Model == TLSModel::GeneralDynamic) { 8272 // The call needs a relocation too for linker relaxation. It doesn't make 8273 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 8274 // the address. 8275 SDValue SymAddr = 8276 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 8277 8278 // Finally we can make a call to calculate the offset from tpidr_el0. 8279 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 8280 } else 8281 llvm_unreachable("Unsupported ELF TLS access model"); 8282 8283 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 8284 } 8285 8286 SDValue 8287 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, 8288 SelectionDAG &DAG) const { 8289 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 8290 8291 SDValue Chain = DAG.getEntryNode(); 8292 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8293 SDLoc DL(Op); 8294 8295 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); 8296 8297 // Load the ThreadLocalStoragePointer from the TEB 8298 // A pointer to the TLS array is located at offset 0x58 from the TEB. 8299 SDValue TLSArray = 8300 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); 8301 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 8302 Chain = TLSArray.getValue(1); 8303 8304 // Load the TLS index from the C runtime; 8305 // This does the same as getAddr(), but without having a GlobalAddressSDNode. 8306 // This also does the same as LOADgot, but using a generic i32 load, 8307 // while LOADgot only loads i64. 8308 SDValue TLSIndexHi = 8309 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); 8310 SDValue TLSIndexLo = DAG.getTargetExternalSymbol( 8311 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 8312 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); 8313 SDValue TLSIndex = 8314 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); 8315 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); 8316 Chain = TLSIndex.getValue(1); 8317 8318 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 8319 // offset into the TLSArray. 
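  // i.e. roughly: TLSArray = *(TEB + 0x58); TLS = TLSArray[_tls_index]
  // (a load from [TLSArray + _tls_index * 8]).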
8320 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); 8321 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 8322 DAG.getConstant(3, DL, PtrVT)); 8323 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 8324 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 8325 MachinePointerInfo()); 8326 Chain = TLS.getValue(1); 8327 8328 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 8329 const GlobalValue *GV = GA->getGlobal(); 8330 SDValue TGAHi = DAG.getTargetGlobalAddress( 8331 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 8332 SDValue TGALo = DAG.getTargetGlobalAddress( 8333 GV, DL, PtrVT, 0, 8334 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 8335 8336 // Add the offset from the start of the .tls section (section base). 8337 SDValue Addr = 8338 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, 8339 DAG.getTargetConstant(0, DL, MVT::i32)), 8340 0); 8341 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); 8342 return Addr; 8343 } 8344 8345 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 8346 SelectionDAG &DAG) const { 8347 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 8348 if (DAG.getTarget().useEmulatedTLS()) 8349 return LowerToTLSEmulatedModel(GA, DAG); 8350 8351 if (Subtarget->isTargetDarwin()) 8352 return LowerDarwinGlobalTLSAddress(Op, DAG); 8353 if (Subtarget->isTargetELF()) 8354 return LowerELFGlobalTLSAddress(Op, DAG); 8355 if (Subtarget->isTargetWindows()) 8356 return LowerWindowsGlobalTLSAddress(Op, DAG); 8357 8358 llvm_unreachable("Unexpected platform trying to use TLS"); 8359 } 8360 8361 // Looks through \param Val to determine the bit that can be used to 8362 // check the sign of the value. It returns the unextended value and 8363 // the sign bit position. 8364 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { 8365 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) 8366 return {Val.getOperand(0), 8367 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - 8368 1}; 8369 8370 if (Val.getOpcode() == ISD::SIGN_EXTEND) 8371 return {Val.getOperand(0), 8372 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; 8373 8374 return {Val, Val.getValueSizeInBits() - 1}; 8375 } 8376 8377 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 8378 SDValue Chain = Op.getOperand(0); 8379 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 8380 SDValue LHS = Op.getOperand(2); 8381 SDValue RHS = Op.getOperand(3); 8382 SDValue Dest = Op.getOperand(4); 8383 SDLoc dl(Op); 8384 8385 MachineFunction &MF = DAG.getMachineFunction(); 8386 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 8387 // will not be produced, as they are conditional branch instructions that do 8388 // not set flags. 8389 bool ProduceNonFlagSettingCondBr = 8390 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 8391 8392 // Handle f128 first, since lowering it will result in comparing the return 8393 // value of a libcall against zero, which is just what the rest of LowerBR_CC 8394 // is expecting to deal with. 8395 if (LHS.getValueType() == MVT::f128) { 8396 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); 8397 8398 // If softenSetCCOperands returned a scalar, we need to compare the result 8399 // against zero to select between true and false values. 
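    // (For example, an f128 SETLT typically becomes a call to the __lttf2
    // libcall followed by an integer compare of its result against zero.)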
8400 if (!RHS.getNode()) { 8401 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 8402 CC = ISD::SETNE; 8403 } 8404 } 8405 8406 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 8407 // instruction. 8408 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && 8409 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8410 // Only lower legal XALUO ops. 8411 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 8412 return SDValue(); 8413 8414 // The actual operation with overflow check. 8415 AArch64CC::CondCode OFCC; 8416 SDValue Value, Overflow; 8417 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 8418 8419 if (CC == ISD::SETNE) 8420 OFCC = getInvertedCondCode(OFCC); 8421 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 8422 8423 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 8424 Overflow); 8425 } 8426 8427 if (LHS.getValueType().isInteger()) { 8428 assert((LHS.getValueType() == RHS.getValueType()) && 8429 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 8430 8431 // If the RHS of the comparison is zero, we can potentially fold this 8432 // to a specialized branch. 8433 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 8434 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { 8435 if (CC == ISD::SETEQ) { 8436 // See if we can use a TBZ to fold in an AND as well. 8437 // TBZ has a smaller branch displacement than CBZ. If the offset is 8438 // out of bounds, a late MI-layer pass rewrites branches. 8439 // 403.gcc is an example that hits this case. 8440 if (LHS.getOpcode() == ISD::AND && 8441 isa<ConstantSDNode>(LHS.getOperand(1)) && 8442 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 8443 SDValue Test = LHS.getOperand(0); 8444 uint64_t Mask = LHS.getConstantOperandVal(1); 8445 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 8446 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 8447 Dest); 8448 } 8449 8450 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 8451 } else if (CC == ISD::SETNE) { 8452 // See if we can use a TBZ to fold in an AND as well. 8453 // TBZ has a smaller branch displacement than CBZ. If the offset is 8454 // out of bounds, a late MI-layer pass rewrites branches. 8455 // 403.gcc is an example that hits this case. 8456 if (LHS.getOpcode() == ISD::AND && 8457 isa<ConstantSDNode>(LHS.getOperand(1)) && 8458 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 8459 SDValue Test = LHS.getOperand(0); 8460 uint64_t Mask = LHS.getConstantOperandVal(1); 8461 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 8462 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 8463 Dest); 8464 } 8465 8466 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 8467 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 8468 // Don't combine AND since emitComparison converts the AND to an ANDS 8469 // (a.k.a. TST) and the test in the test bit and branch instruction 8470 // becomes redundant. This would also increase register pressure. 8471 uint64_t SignBitPos; 8472 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); 8473 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 8474 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); 8475 } 8476 } 8477 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 8478 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { 8479 // Don't combine AND since emitComparison converts the AND to an ANDS 8480 // (a.k.a. 
TST) and the test in the test bit and branch instruction 8481 // becomes redundant. This would also increase register pressure. 8482 uint64_t SignBitPos; 8483 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); 8484 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 8485 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); 8486 } 8487 8488 SDValue CCVal; 8489 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 8490 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 8491 Cmp); 8492 } 8493 8494 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || 8495 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 8496 8497 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 8498 // clean. Some of them require two branches to implement. 8499 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 8500 AArch64CC::CondCode CC1, CC2; 8501 changeFPCCToAArch64CC(CC, CC1, CC2); 8502 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 8503 SDValue BR1 = 8504 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 8505 if (CC2 != AArch64CC::AL) { 8506 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 8507 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 8508 Cmp); 8509 } 8510 8511 return BR1; 8512 } 8513 8514 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 8515 SelectionDAG &DAG) const { 8516 if (!Subtarget->hasNEON()) 8517 return SDValue(); 8518 8519 EVT VT = Op.getValueType(); 8520 EVT IntVT = VT.changeTypeToInteger(); 8521 SDLoc DL(Op); 8522 8523 SDValue In1 = Op.getOperand(0); 8524 SDValue In2 = Op.getOperand(1); 8525 EVT SrcVT = In2.getValueType(); 8526 8527 if (!SrcVT.bitsEq(VT)) 8528 In2 = DAG.getFPExtendOrRound(In2, DL, VT); 8529 8530 if (VT.isScalableVector()) 8531 IntVT = 8532 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); 8533 8534 if (VT.isFixedLengthVector() && 8535 useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE())) { 8536 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 8537 8538 In1 = convertToScalableVector(DAG, ContainerVT, In1); 8539 In2 = convertToScalableVector(DAG, ContainerVT, In2); 8540 8541 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2); 8542 return convertFromScalableVector(DAG, VT, Res); 8543 } 8544 8545 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { 8546 if (VT.isScalableVector()) 8547 return getSVESafeBitCast(VT, Op, DAG); 8548 8549 return DAG.getBitcast(VT, Op); 8550 }; 8551 8552 SDValue VecVal1, VecVal2; 8553 EVT VecVT; 8554 auto SetVecVal = [&](int Idx = -1) { 8555 if (!VT.isVector()) { 8556 VecVal1 = 8557 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); 8558 VecVal2 = 8559 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); 8560 } else { 8561 VecVal1 = BitCast(VecVT, In1, DAG); 8562 VecVal2 = BitCast(VecVT, In2, DAG); 8563 } 8564 }; 8565 if (VT.isVector()) { 8566 VecVT = IntVT; 8567 SetVecVal(); 8568 } else if (VT == MVT::f64) { 8569 VecVT = MVT::v2i64; 8570 SetVecVal(AArch64::dsub); 8571 } else if (VT == MVT::f32) { 8572 VecVT = MVT::v4i32; 8573 SetVecVal(AArch64::ssub); 8574 } else if (VT == MVT::f16) { 8575 VecVT = MVT::v8i16; 8576 SetVecVal(AArch64::hsub); 8577 } else { 8578 llvm_unreachable("Invalid type for copysign!"); 8579 } 8580 8581 unsigned BitWidth = In1.getScalarValueSizeInBits(); 8582 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); 8583 
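  // The BSP below computes copysign(In1, In2) as a bitwise select: bits where
  // SignMaskV is set (everything except the sign bit) come from In1, and the
  // remaining sign bit comes from In2, i.e.
  //   result = (In1 & SignMaskV) | (In2 & ~SignMaskV)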
8584 // We want to materialize a mask with every bit but the high bit set, but the 8585 // AdvSIMD immediate moves cannot materialize that in a single instruction for 8586 // 64-bit elements. Instead, materialize all bits set and then negate that. 8587 if (VT == MVT::f64 || VT == MVT::v2f64) { 8588 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); 8589 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); 8590 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); 8591 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); 8592 } 8593 8594 SDValue BSP = 8595 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); 8596 if (VT == MVT::f16) 8597 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); 8598 if (VT == MVT::f32) 8599 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); 8600 if (VT == MVT::f64) 8601 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); 8602 8603 return BitCast(VT, BSP, DAG); 8604 } 8605 8606 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, 8607 SelectionDAG &DAG) const { 8608 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 8609 Attribute::NoImplicitFloat)) 8610 return SDValue(); 8611 8612 if (!Subtarget->hasNEON()) 8613 return SDValue(); 8614 8615 bool IsParity = Op.getOpcode() == ISD::PARITY; 8616 SDValue Val = Op.getOperand(0); 8617 SDLoc DL(Op); 8618 EVT VT = Op.getValueType(); 8619 8620 // for i32, general parity function using EORs is more efficient compared to 8621 // using floating point 8622 if (VT == MVT::i32 && IsParity) 8623 return SDValue(); 8624 8625 // If there is no CNT instruction available, GPR popcount can 8626 // be more efficiently lowered to the following sequence that uses 8627 // AdvSIMD registers/instructions as long as the copies to/from 8628 // the AdvSIMD registers are cheap. 
8629 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 8630 // CNT V0.8B, V0.8B // 8xbyte pop-counts 8631 // ADDV B0, V0.8B // sum 8xbyte pop-counts 8632 // UMOV X0, V0.B[0] // copy byte result back to integer reg 8633 if (VT == MVT::i32 || VT == MVT::i64) { 8634 if (VT == MVT::i32) 8635 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 8636 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 8637 8638 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 8639 SDValue UaddLV = DAG.getNode( 8640 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 8641 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 8642 8643 if (IsParity) 8644 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, 8645 DAG.getConstant(1, DL, MVT::i32)); 8646 8647 if (VT == MVT::i64) 8648 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 8649 return UaddLV; 8650 } else if (VT == MVT::i128) { 8651 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); 8652 8653 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); 8654 SDValue UaddLV = DAG.getNode( 8655 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 8656 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 8657 8658 if (IsParity) 8659 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, 8660 DAG.getConstant(1, DL, MVT::i32)); 8661 8662 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); 8663 } 8664 8665 assert(!IsParity && "ISD::PARITY of vector types not supported"); 8666 8667 if (VT.isScalableVector() || 8668 useSVEForFixedLengthVectorVT(VT, 8669 Subtarget->forceStreamingCompatibleSVE())) 8670 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); 8671 8672 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 8673 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 8674 "Unexpected type for custom ctpop lowering"); 8675 8676 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 8677 Val = DAG.getBitcast(VT8Bit, Val); 8678 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); 8679 8680 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 8681 unsigned EltSize = 8; 8682 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 8683 while (EltSize != VT.getScalarSizeInBits()) { 8684 EltSize *= 2; 8685 NumElts /= 2; 8686 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 8687 Val = DAG.getNode( 8688 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, 8689 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); 8690 } 8691 8692 return Val; 8693 } 8694 8695 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8696 EVT VT = Op.getValueType(); 8697 assert(VT.isScalableVector() || 8698 useSVEForFixedLengthVectorVT( 8699 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); 8700 8701 SDLoc DL(Op); 8702 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); 8703 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); 8704 } 8705 8706 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, 8707 SelectionDAG &DAG) const { 8708 8709 EVT VT = Op.getValueType(); 8710 SDLoc DL(Op); 8711 unsigned Opcode = Op.getOpcode(); 8712 ISD::CondCode CC; 8713 switch (Opcode) { 8714 default: 8715 llvm_unreachable("Wrong instruction"); 8716 case ISD::SMAX: 8717 CC = ISD::SETGT; 8718 break; 8719 case ISD::SMIN: 8720 CC = ISD::SETLT; 8721 break; 8722 case ISD::UMAX: 8723 CC = ISD::SETUGT; 8724 break; 8725 case ISD::UMIN: 8726 CC = ISD::SETULT; 8727 break; 8728 } 8729 8730 if (VT.isScalableVector() || 8731 useSVEForFixedLengthVectorVT( 8732 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { 8733 switch (Opcode) { 8734 default: 8735 llvm_unreachable("Wrong instruction"); 8736 case ISD::SMAX: 8737 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); 8738 case ISD::SMIN: 8739 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); 8740 case ISD::UMAX: 8741 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); 8742 case ISD::UMIN: 8743 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); 8744 } 8745 } 8746 8747 SDValue Op0 = Op.getOperand(0); 8748 SDValue Op1 = Op.getOperand(1); 8749 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); 8750 return DAG.getSelect(DL, VT, Cond, Op0, Op1); 8751 } 8752 8753 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, 8754 SelectionDAG &DAG) const { 8755 EVT VT = Op.getValueType(); 8756 8757 if (VT.isScalableVector() || 8758 useSVEForFixedLengthVectorVT( 8759 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) 8760 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU); 8761 8762 SDLoc DL(Op); 8763 SDValue REVB; 8764 MVT VST; 8765 8766 switch (VT.getSimpleVT().SimpleTy) { 8767 default: 8768 llvm_unreachable("Invalid type for bitreverse!"); 8769 8770 case MVT::v2i32: { 8771 VST = MVT::v8i8; 8772 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); 8773 8774 break; 8775 } 8776 8777 case MVT::v4i32: { 8778 VST = MVT::v16i8; 8779 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); 8780 8781 break; 8782 } 8783 8784 case MVT::v1i64: { 8785 VST = MVT::v8i8; 8786 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); 8787 8788 break; 8789 } 8790 8791 case MVT::v2i64: { 8792 VST = MVT::v16i8; 8793 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); 8794 8795 break; 8796 } 8797 } 8798 8799 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, 8800 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB)); 8801 } 8802 8803 // Check whether the continuous comparison sequence. 
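// For example, an inline-expanded memcmp()/bcmp() equality test of a few
// words typically looks like
//   (or (or (xor a0, b0), (xor a1, b1)), (xor a2, b2)) == 0
// where each (xor x, y) leaf compares one word; the chain is collected here
// so that the combine below can rewrite it as a CMP/CCMP conjunction.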
8804 static bool 8805 isOrXorChain(SDValue N, unsigned &Num, 8806 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) { 8807 if (Num == MaxXors) 8808 return false; 8809 8810 // Skip the one-use zext 8811 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) 8812 N = N->getOperand(0); 8813 8814 // The leaf node must be XOR 8815 if (N->getOpcode() == ISD::XOR) { 8816 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1))); 8817 Num++; 8818 return true; 8819 } 8820 8821 // All the non-leaf nodes must be OR. 8822 if (N->getOpcode() != ISD::OR || !N->hasOneUse()) 8823 return false; 8824 8825 if (isOrXorChain(N->getOperand(0), Num, WorkList) && 8826 isOrXorChain(N->getOperand(1), Num, WorkList)) 8827 return true; 8828 return false; 8829 } 8830 8831 // Transform chains of ORs and XORs, which usually outlined by memcmp/bmp. 8832 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) { 8833 SDValue LHS = N->getOperand(0); 8834 SDValue RHS = N->getOperand(1); 8835 SDLoc DL(N); 8836 EVT VT = N->getValueType(0); 8837 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList; 8838 8839 // Only handle integer compares. 8840 if (N->getOpcode() != ISD::SETCC) 8841 return SDValue(); 8842 8843 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); 8844 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as: 8845 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag 8846 unsigned NumXors = 0; 8847 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && 8848 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() && 8849 isOrXorChain(LHS, NumXors, WorkList)) { 8850 SDValue XOR0, XOR1; 8851 std::tie(XOR0, XOR1) = WorkList[0]; 8852 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR; 8853 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); 8854 for (unsigned I = 1; I < WorkList.size(); I++) { 8855 std::tie(XOR0, XOR1) = WorkList[I]; 8856 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); 8857 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain); 8858 } 8859 8860 // Exit early by inverting the condition, which help reduce indentations. 8861 return Cmp; 8862 } 8863 8864 return SDValue(); 8865 } 8866 8867 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8868 8869 if (Op.getValueType().isVector()) 8870 return LowerVSETCC(Op, DAG); 8871 8872 bool IsStrict = Op->isStrictFPOpcode(); 8873 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 8874 unsigned OpNo = IsStrict ? 1 : 0; 8875 SDValue Chain; 8876 if (IsStrict) 8877 Chain = Op.getOperand(0); 8878 SDValue LHS = Op.getOperand(OpNo + 0); 8879 SDValue RHS = Op.getOperand(OpNo + 1); 8880 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); 8881 SDLoc dl(Op); 8882 8883 // We chose ZeroOrOneBooleanContents, so use zero and one. 8884 EVT VT = Op.getValueType(); 8885 SDValue TVal = DAG.getConstant(1, dl, VT); 8886 SDValue FVal = DAG.getConstant(0, dl, VT); 8887 8888 // Handle f128 first, since one possible outcome is a normal integer 8889 // comparison which gets picked up by the next if statement. 8890 if (LHS.getValueType() == MVT::f128) { 8891 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, 8892 IsSignaling); 8893 8894 // If softenSetCCOperands returned a scalar, use it. 8895 if (!RHS.getNode()) { 8896 assert(LHS.getValueType() == Op.getValueType() && 8897 "Unexpected setcc expansion!"); 8898 return IsStrict ? 
DAG.getMergeValues({LHS, Chain}, dl) : LHS; 8899 } 8900 } 8901 8902 if (LHS.getValueType().isInteger()) { 8903 SDValue CCVal; 8904 SDValue Cmp = getAArch64Cmp( 8905 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); 8906 8907 // Note that we inverted the condition above, so we reverse the order of 8908 // the true and false operands here. This will allow the setcc to be 8909 // matched to a single CSINC instruction. 8910 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 8911 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; 8912 } 8913 8914 // Now we know we're dealing with FP values. 8915 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 8916 LHS.getValueType() == MVT::f64); 8917 8918 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead 8919 // and do the comparison. 8920 SDValue Cmp; 8921 if (IsStrict) 8922 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); 8923 else 8924 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 8925 8926 AArch64CC::CondCode CC1, CC2; 8927 changeFPCCToAArch64CC(CC, CC1, CC2); 8928 SDValue Res; 8929 if (CC2 == AArch64CC::AL) { 8930 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, 8931 CC2); 8932 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 8933 8934 // Note that we inverted the condition above, so we reverse the order of 8935 // the true and false operands here. This will allow the setcc to be 8936 // matched to a single CSINC instruction. 8937 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); 8938 } else { 8939 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 8940 // totally clean. Some of them require two CSELs to implement. As is in 8941 // this case, we emit the first CSEL and then emit a second using the output 8942 // of the first as the RHS. We're effectively OR'ing the two CC's together. 8943 8944 // FIXME: It would be nice if we could match the two CSELs to two CSINCs. 8945 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 8946 SDValue CS1 = 8947 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 8948 8949 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 8950 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 8951 } 8952 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; 8953 } 8954 8955 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, 8956 SelectionDAG &DAG) const { 8957 8958 SDValue LHS = Op.getOperand(0); 8959 SDValue RHS = Op.getOperand(1); 8960 EVT VT = LHS.getValueType(); 8961 if (VT != MVT::i32 && VT != MVT::i64) 8962 return SDValue(); 8963 8964 SDLoc DL(Op); 8965 SDValue Carry = Op.getOperand(2); 8966 // SBCS uses a carry not a borrow so the carry flag should be inverted first. 8967 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true); 8968 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue), 8969 LHS, RHS, InvCarry); 8970 8971 EVT OpVT = Op.getValueType(); 8972 SDValue TVal = DAG.getConstant(1, DL, OpVT); 8973 SDValue FVal = DAG.getConstant(0, DL, OpVT); 8974 8975 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 8976 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT); 8977 SDValue CCVal = 8978 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32); 8979 // Inputs are swapped because the condition is inverted. This will allow 8980 // matching with a single CSINC instruction. 
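  // For example, a CSEL node with operands (0, 1, inv(cc)) matches
  // CSINC Wd, WZR, WZR, inv(cc), i.e. the CSET Wd, cc alias, which computes
  // inv(cc) ? 0 : 0 + 1 == (cc ? 1 : 0).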
  return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
                     Cmp.getValue(1));
}

SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                              SDValue RHS, SDValue TVal,
                                              SDValue FVal, const SDLoc &dl,
                                              SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and
    // transform into (OR (ASR lhs, N-1), 1), which requires fewer
    // instructions for the supported types.
    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
        CTVal->isOne() && CFVal->isAllOnes() &&
        LHS.getValueType() == TVal.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
    }

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
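      // (CSNEG Wd, Wn, Wm, cond computes cond ? Wn : -Wm, so the negated
      // value never needs to be materialized in a separate register.)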
9052 if (isNullConstant(TVal.getOperand(0))) { 9053 std::swap(TVal, FVal); 9054 std::swap(CTVal, CFVal); 9055 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 9056 } 9057 } else if (CTVal && CFVal) { 9058 const int64_t TrueVal = CTVal->getSExtValue(); 9059 const int64_t FalseVal = CFVal->getSExtValue(); 9060 bool Swap = false; 9061 9062 // If both TVal and FVal are constants, see if FVal is the 9063 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 9064 // instead of a CSEL in that case. 9065 if (TrueVal == ~FalseVal) { 9066 Opcode = AArch64ISD::CSINV; 9067 } else if (FalseVal > std::numeric_limits<int64_t>::min() && 9068 TrueVal == -FalseVal) { 9069 Opcode = AArch64ISD::CSNEG; 9070 } else if (TVal.getValueType() == MVT::i32) { 9071 // If our operands are only 32-bit wide, make sure we use 32-bit 9072 // arithmetic for the check whether we can use CSINC. This ensures that 9073 // the addition in the check will wrap around properly in case there is 9074 // an overflow (which would not be the case if we do the check with 9075 // 64-bit arithmetic). 9076 const uint32_t TrueVal32 = CTVal->getZExtValue(); 9077 const uint32_t FalseVal32 = CFVal->getZExtValue(); 9078 9079 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 9080 Opcode = AArch64ISD::CSINC; 9081 9082 if (TrueVal32 > FalseVal32) { 9083 Swap = true; 9084 } 9085 } 9086 } else { 9087 // 64-bit check whether we can use CSINC. 9088 const uint64_t TrueVal64 = TrueVal; 9089 const uint64_t FalseVal64 = FalseVal; 9090 9091 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { 9092 Opcode = AArch64ISD::CSINC; 9093 9094 if (TrueVal > FalseVal) { 9095 Swap = true; 9096 } 9097 } 9098 } 9099 9100 // Swap TVal and FVal if necessary. 9101 if (Swap) { 9102 std::swap(TVal, FVal); 9103 std::swap(CTVal, CFVal); 9104 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 9105 } 9106 9107 if (Opcode != AArch64ISD::CSEL) { 9108 // Drop FVal since we can get its value by simply inverting/negating 9109 // TVal. 9110 FVal = TVal; 9111 } 9112 } 9113 9114 // Avoid materializing a constant when possible by reusing a known value in 9115 // a register. However, don't perform this optimization if the known value 9116 // is one, zero or negative one in the case of a CSEL. We can always 9117 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the 9118 // FVal, respectively. 9119 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); 9120 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && 9121 !RHSVal->isZero() && !RHSVal->isAllOnes()) { 9122 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 9123 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to 9124 // "a != C ? x : a" to avoid materializing C. 9125 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) 9126 TVal = LHS; 9127 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) 9128 FVal = LHS; 9129 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { 9130 assert (CTVal && CFVal && "Expected constant operands for CSNEG."); 9131 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to 9132 // avoid materializing C. 
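      // (CSINV Wd, Wn, Wm, cond computes cond ? Wn : ~Wm; with Wm = WZR the
      // false result is ~0 == -1, so "a == 1 ? 1 : -1" becomes
      // "eq ? a : ~WZR" without materializing either constant.)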
9133 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 9134 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { 9135 Opcode = AArch64ISD::CSINV; 9136 TVal = LHS; 9137 FVal = DAG.getConstant(0, dl, FVal.getValueType()); 9138 } 9139 } 9140 9141 SDValue CCVal; 9142 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 9143 EVT VT = TVal.getValueType(); 9144 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 9145 } 9146 9147 // Now we know we're dealing with FP values. 9148 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 9149 LHS.getValueType() == MVT::f64); 9150 assert(LHS.getValueType() == RHS.getValueType()); 9151 EVT VT = TVal.getValueType(); 9152 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 9153 9154 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 9155 // clean. Some of them require two CSELs to implement. 9156 AArch64CC::CondCode CC1, CC2; 9157 changeFPCCToAArch64CC(CC, CC1, CC2); 9158 9159 if (DAG.getTarget().Options.UnsafeFPMath) { 9160 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and 9161 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. 9162 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); 9163 if (RHSVal && RHSVal->isZero()) { 9164 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); 9165 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); 9166 9167 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && 9168 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) 9169 TVal = LHS; 9170 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && 9171 CFVal && CFVal->isZero() && 9172 FVal.getValueType() == LHS.getValueType()) 9173 FVal = LHS; 9174 } 9175 } 9176 9177 // Emit first, and possibly only, CSEL. 9178 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 9179 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 9180 9181 // If we need a second CSEL, emit it, using the output of the first as the 9182 // RHS. We're effectively OR'ing the two CC's together. 9183 if (CC2 != AArch64CC::AL) { 9184 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 9185 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 9186 } 9187 9188 // Otherwise, return the output of the first CSEL. 9189 return CS1; 9190 } 9191 9192 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, 9193 SelectionDAG &DAG) const { 9194 EVT Ty = Op.getValueType(); 9195 auto Idx = Op.getConstantOperandAPInt(2); 9196 int64_t IdxVal = Idx.getSExtValue(); 9197 assert(Ty.isScalableVector() && 9198 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE"); 9199 9200 // We can use the splice instruction for certain index values where we are 9201 // able to efficiently generate the correct predicate. The index will be 9202 // inverted and used directly as the input to the ptrue instruction, i.e. 9203 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the 9204 // splice predicate. However, we can only do this if we can guarantee that 9205 // there are enough elements in the vector, hence we check the index <= min 9206 // number of elements. 9207 std::optional<unsigned> PredPattern; 9208 if (Ty.isScalableVector() && IdxVal < 0 && 9209 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) != 9210 std::nullopt) { 9211 SDLoc DL(Op); 9212 9213 // Create a predicate where all but the last -IdxVal elements are false. 
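    // For example, IdxVal == -2 maps to the VL2 ptrue pattern; after the
    // reverse only the last two lanes are active, so the SPLICE below takes
    // the final two elements of the first operand followed by the leading
    // elements of the second, which is vector.splice with index -2.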
9214 EVT PredVT = Ty.changeVectorElementType(MVT::i1); 9215 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern); 9216 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred); 9217 9218 // Now splice the two inputs together using the predicate. 9219 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0), 9220 Op.getOperand(1)); 9221 } 9222 9223 // This will select to an EXT instruction, which has a maximum immediate 9224 // value of 255, hence 2048-bits is the maximum value we can lower. 9225 if (IdxVal >= 0 && 9226 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits())) 9227 return Op; 9228 9229 return SDValue(); 9230 } 9231 9232 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 9233 SelectionDAG &DAG) const { 9234 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 9235 SDValue LHS = Op.getOperand(0); 9236 SDValue RHS = Op.getOperand(1); 9237 SDValue TVal = Op.getOperand(2); 9238 SDValue FVal = Op.getOperand(3); 9239 SDLoc DL(Op); 9240 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 9241 } 9242 9243 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 9244 SelectionDAG &DAG) const { 9245 SDValue CCVal = Op->getOperand(0); 9246 SDValue TVal = Op->getOperand(1); 9247 SDValue FVal = Op->getOperand(2); 9248 SDLoc DL(Op); 9249 9250 EVT Ty = Op.getValueType(); 9251 if (Ty.isScalableVector()) { 9252 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); 9253 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); 9254 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); 9255 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); 9256 } 9257 9258 if (useSVEForFixedLengthVectorVT(Ty)) { 9259 // FIXME: Ideally this would be the same as above using i1 types, however 9260 // for the moment we can't deal with fixed i1 vector types properly, so 9261 // instead extend the predicate to a result type sized integer vector. 9262 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); 9263 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); 9264 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); 9265 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); 9266 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); 9267 } 9268 9269 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 9270 // instruction. 9271 if (ISD::isOverflowIntrOpRes(CCVal)) { 9272 // Only lower legal XALUO ops. 9273 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 9274 return SDValue(); 9275 9276 AArch64CC::CondCode OFCC; 9277 SDValue Value, Overflow; 9278 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 9279 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 9280 9281 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 9282 CCVal, Overflow); 9283 } 9284 9285 // Lower it the same way as we would lower a SELECT_CC node. 
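  // That is, select(setcc(a, b, cc), t, f) is lowered as
  // select_cc(a, b, t, f, cc), and a bare i1 condition c is treated as
  // select_cc(c, 0, t, f, ne).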
9286 ISD::CondCode CC; 9287 SDValue LHS, RHS; 9288 if (CCVal.getOpcode() == ISD::SETCC) { 9289 LHS = CCVal.getOperand(0); 9290 RHS = CCVal.getOperand(1); 9291 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get(); 9292 } else { 9293 LHS = CCVal; 9294 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 9295 CC = ISD::SETNE; 9296 } 9297 9298 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in 9299 // order to use FCSELSrrr 9300 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { 9301 TVal = SDValue( 9302 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 9303 DAG.getUNDEF(MVT::f32), TVal, 9304 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 9305 0); 9306 FVal = SDValue( 9307 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 9308 DAG.getUNDEF(MVT::f32), FVal, 9309 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 9310 0); 9311 } 9312 9313 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 9314 9315 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { 9316 Res = SDValue( 9317 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res, 9318 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 9319 0); 9320 } 9321 9322 return Res; 9323 } 9324 9325 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 9326 SelectionDAG &DAG) const { 9327 // Jump table entries as PC relative offsets. No additional tweaking 9328 // is necessary here. Just get the address of the jump table. 9329 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 9330 9331 if (getTargetMachine().getCodeModel() == CodeModel::Large && 9332 !Subtarget->isTargetMachO()) { 9333 return getAddrLarge(JT, DAG); 9334 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 9335 return getAddrTiny(JT, DAG); 9336 } 9337 return getAddr(JT, DAG); 9338 } 9339 9340 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, 9341 SelectionDAG &DAG) const { 9342 // Jump table entries as PC relative offsets. No additional tweaking 9343 // is necessary here. Just get the address of the jump table. 9344 SDLoc DL(Op); 9345 SDValue JT = Op.getOperand(1); 9346 SDValue Entry = Op.getOperand(2); 9347 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); 9348 9349 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 9350 AFI->setJumpTableEntryInfo(JTI, 4, nullptr); 9351 9352 SDNode *Dest = 9353 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, 9354 Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); 9355 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), 9356 SDValue(Dest, 0)); 9357 } 9358 9359 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 9360 SelectionDAG &DAG) const { 9361 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 9362 9363 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 9364 // Use the GOT for the large code model on iOS. 
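    // (Roughly: getAddrLarge materializes the address with a MOVZ/MOVK
    // sequence, getAddrTiny with a single ADR, and getAddr with ADRP+ADD;
    // MachO instead loads the address from the GOT here.)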
9365 if (Subtarget->isTargetMachO()) { 9366 return getGOT(CP, DAG); 9367 } 9368 return getAddrLarge(CP, DAG); 9369 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 9370 return getAddrTiny(CP, DAG); 9371 } else { 9372 return getAddr(CP, DAG); 9373 } 9374 } 9375 9376 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 9377 SelectionDAG &DAG) const { 9378 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); 9379 if (getTargetMachine().getCodeModel() == CodeModel::Large && 9380 !Subtarget->isTargetMachO()) { 9381 return getAddrLarge(BA, DAG); 9382 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 9383 return getAddrTiny(BA, DAG); 9384 } 9385 return getAddr(BA, DAG); 9386 } 9387 9388 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 9389 SelectionDAG &DAG) const { 9390 AArch64FunctionInfo *FuncInfo = 9391 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 9392 9393 SDLoc DL(Op); 9394 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 9395 getPointerTy(DAG.getDataLayout())); 9396 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); 9397 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9398 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9399 MachinePointerInfo(SV)); 9400 } 9401 9402 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, 9403 SelectionDAG &DAG) const { 9404 MachineFunction &MF = DAG.getMachineFunction(); 9405 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 9406 9407 SDLoc DL(Op); 9408 SDValue FR; 9409 if (Subtarget->isWindowsArm64EC()) { 9410 // With the Arm64EC ABI, we compute the address of the varargs save area 9411 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry, 9412 // but calls from an entry thunk can pass in a different address. 9413 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); 9414 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64); 9415 uint64_t StackOffset; 9416 if (FuncInfo->getVarArgsGPRSize() > 0) 9417 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize(); 9418 else 9419 StackOffset = FuncInfo->getVarArgsStackOffset(); 9420 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, 9421 DAG.getConstant(StackOffset, DL, MVT::i64)); 9422 } else { 9423 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 9424 ? FuncInfo->getVarArgsGPRIndex() 9425 : FuncInfo->getVarArgsStackIndex(), 9426 getPointerTy(DAG.getDataLayout())); 9427 } 9428 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9429 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9430 MachinePointerInfo(SV)); 9431 } 9432 9433 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 9434 SelectionDAG &DAG) const { 9435 // The layout of the va_list struct is specified in the AArch64 Procedure Call 9436 // Standard, section B.3. 9437 MachineFunction &MF = DAG.getMachineFunction(); 9438 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 9439 unsigned PtrSize = Subtarget->isTargetILP32() ? 
4 : 8; 9440 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 9441 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9442 SDLoc DL(Op); 9443 9444 SDValue Chain = Op.getOperand(0); 9445 SDValue VAList = Op.getOperand(1); 9446 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9447 SmallVector<SDValue, 4> MemOps; 9448 9449 // void *__stack at offset 0 9450 unsigned Offset = 0; 9451 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 9452 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); 9453 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 9454 MachinePointerInfo(SV), Align(PtrSize))); 9455 9456 // void *__gr_top at offset 8 (4 on ILP32) 9457 Offset += PtrSize; 9458 int GPRSize = FuncInfo->getVarArgsGPRSize(); 9459 if (GPRSize > 0) { 9460 SDValue GRTop, GRTopAddr; 9461 9462 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9463 DAG.getConstant(Offset, DL, PtrVT)); 9464 9465 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 9466 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 9467 DAG.getConstant(GPRSize, DL, PtrVT)); 9468 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); 9469 9470 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 9471 MachinePointerInfo(SV, Offset), 9472 Align(PtrSize))); 9473 } 9474 9475 // void *__vr_top at offset 16 (8 on ILP32) 9476 Offset += PtrSize; 9477 int FPRSize = FuncInfo->getVarArgsFPRSize(); 9478 if (FPRSize > 0) { 9479 SDValue VRTop, VRTopAddr; 9480 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9481 DAG.getConstant(Offset, DL, PtrVT)); 9482 9483 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 9484 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 9485 DAG.getConstant(FPRSize, DL, PtrVT)); 9486 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); 9487 9488 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 9489 MachinePointerInfo(SV, Offset), 9490 Align(PtrSize))); 9491 } 9492 9493 // int __gr_offs at offset 24 (12 on ILP32) 9494 Offset += PtrSize; 9495 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9496 DAG.getConstant(Offset, DL, PtrVT)); 9497 MemOps.push_back( 9498 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), 9499 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 9500 9501 // int __vr_offs at offset 28 (16 on ILP32) 9502 Offset += 4; 9503 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9504 DAG.getConstant(Offset, DL, PtrVT)); 9505 MemOps.push_back( 9506 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), 9507 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 9508 9509 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 9510 } 9511 9512 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 9513 SelectionDAG &DAG) const { 9514 MachineFunction &MF = DAG.getMachineFunction(); 9515 9516 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) 9517 return LowerWin64_VASTART(Op, DAG); 9518 else if (Subtarget->isTargetDarwin()) 9519 return LowerDarwin_VASTART(Op, DAG); 9520 else 9521 return LowerAAPCS_VASTART(Op, DAG); 9522 } 9523 9524 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 9525 SelectionDAG &DAG) const { 9526 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 9527 // pointer. 9528 SDLoc DL(Op); 9529 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; 9530 unsigned VaListSize = 9531 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) 9532 ? PtrSize 9533 : Subtarget->isTargetILP32() ? 
20 : 32; 9534 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 9535 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9536 9537 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), 9538 DAG.getConstant(VaListSize, DL, MVT::i32), 9539 Align(PtrSize), false, false, false, 9540 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 9541 } 9542 9543 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9544 assert(Subtarget->isTargetDarwin() && 9545 "automatic va_arg instruction only works on Darwin"); 9546 9547 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9548 EVT VT = Op.getValueType(); 9549 SDLoc DL(Op); 9550 SDValue Chain = Op.getOperand(0); 9551 SDValue Addr = Op.getOperand(1); 9552 MaybeAlign Align(Op.getConstantOperandVal(3)); 9553 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; 9554 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9555 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 9556 SDValue VAList = 9557 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); 9558 Chain = VAList.getValue(1); 9559 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); 9560 9561 if (VT.isScalableVector()) 9562 report_fatal_error("Passing SVE types to variadic functions is " 9563 "currently not supported"); 9564 9565 if (Align && *Align > MinSlotSize) { 9566 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9567 DAG.getConstant(Align->value() - 1, DL, PtrVT)); 9568 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 9569 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); 9570 } 9571 9572 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 9573 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 9574 9575 // Scalar integer and FP values smaller than 64 bits are implicitly extended 9576 // up to 64 bits. At the very least, we have to increase the striding of the 9577 // vaargs list to match this, and for FP values we need to introduce 9578 // FP_ROUND nodes as well. 9579 if (VT.isInteger() && !VT.isVector()) 9580 ArgSize = std::max(ArgSize, MinSlotSize); 9581 bool NeedFPTrunc = false; 9582 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 9583 ArgSize = 8; 9584 NeedFPTrunc = true; 9585 } 9586 9587 // Increment the pointer, VAList, to the next vaarg 9588 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9589 DAG.getConstant(ArgSize, DL, PtrVT)); 9590 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); 9591 9592 // Store the incremented VAList to the legalized pointer 9593 SDValue APStore = 9594 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); 9595 9596 // Load the actual argument out of the pointer VAList 9597 if (NeedFPTrunc) { 9598 // Load the value as an f64. 9599 SDValue WideFP = 9600 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); 9601 // Round the value down to an f32. 9602 SDValue NarrowFP = 9603 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 9604 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true)); 9605 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 9606 // Merge the rounded value with the chain output of the load. 
9607 return DAG.getMergeValues(Ops, DL); 9608 } 9609 9610 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); 9611 } 9612 9613 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 9614 SelectionDAG &DAG) const { 9615 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9616 MFI.setFrameAddressIsTaken(true); 9617 9618 EVT VT = Op.getValueType(); 9619 SDLoc DL(Op); 9620 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9621 SDValue FrameAddr = 9622 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); 9623 while (Depth--) 9624 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 9625 MachinePointerInfo()); 9626 9627 if (Subtarget->isTargetILP32()) 9628 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, 9629 DAG.getValueType(VT)); 9630 9631 return FrameAddr; 9632 } 9633 9634 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, 9635 SelectionDAG &DAG) const { 9636 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9637 9638 EVT VT = getPointerTy(DAG.getDataLayout()); 9639 SDLoc DL(Op); 9640 int FI = MFI.CreateFixedObject(4, 0, false); 9641 return DAG.getFrameIndex(FI, VT); 9642 } 9643 9644 #define GET_REGISTER_MATCHER 9645 #include "AArch64GenAsmMatcher.inc" 9646 9647 // FIXME? Maybe this could be a TableGen attribute on some registers and 9648 // this table could be generated automatically from RegInfo. 9649 Register AArch64TargetLowering:: 9650 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { 9651 Register Reg = MatchRegisterName(RegName); 9652 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { 9653 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); 9654 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); 9655 if (!Subtarget->isXRegisterReserved(DwarfRegNum)) 9656 Reg = 0; 9657 } 9658 if (Reg) 9659 return Reg; 9660 report_fatal_error(Twine("Invalid register name \"" 9661 + StringRef(RegName) + "\".")); 9662 } 9663 9664 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, 9665 SelectionDAG &DAG) const { 9666 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); 9667 9668 EVT VT = Op.getValueType(); 9669 SDLoc DL(Op); 9670 9671 SDValue FrameAddr = 9672 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 9673 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 9674 9675 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); 9676 } 9677 9678 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 9679 SelectionDAG &DAG) const { 9680 MachineFunction &MF = DAG.getMachineFunction(); 9681 MachineFrameInfo &MFI = MF.getFrameInfo(); 9682 MFI.setReturnAddressIsTaken(true); 9683 9684 EVT VT = Op.getValueType(); 9685 SDLoc DL(Op); 9686 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9687 SDValue ReturnAddress; 9688 if (Depth) { 9689 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 9690 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 9691 ReturnAddress = DAG.getLoad( 9692 VT, DL, DAG.getEntryNode(), 9693 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); 9694 } else { 9695 // Return LR, which contains the return address. Mark it an implicit 9696 // live-in. 
    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  }

  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
  // On Armv8.3-A and onwards XPACI is available, so use that instead.
  SDNode *St;
  if (Subtarget->hasPAuth()) {
    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  } else {
    // XPACLRI operates on LR, therefore we must move the operand accordingly.
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  }
  return SDValue(St, 0);
}

/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Lo, Hi;
  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}

bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses.
  return false;
}

bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                         bool OptForSize) const {
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases, and
  // for the 16-bit case when the target has full fp16 support.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.

  // If we cannot materialize the value in an fmov immediate field, check if
  // it can be encoded as the immediate operand of a logical instruction.
  // The immediate value will be created with either MOVZ, MOVN, or ORR.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit this to at most 2 instructions.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
                              Insn);
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ?
"Legal " : "Illegal ") << VT.getEVTString() 9766 << " imm value: "; Imm.dump();); 9767 return IsLegal; 9768 } 9769 9770 //===----------------------------------------------------------------------===// 9771 // AArch64 Optimization Hooks 9772 //===----------------------------------------------------------------------===// 9773 9774 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, 9775 SDValue Operand, SelectionDAG &DAG, 9776 int &ExtraSteps) { 9777 EVT VT = Operand.getValueType(); 9778 if ((ST->hasNEON() && 9779 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || 9780 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || 9781 VT == MVT::v4f32)) || 9782 (ST->hasSVE() && 9783 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { 9784 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) 9785 // For the reciprocal estimates, convergence is quadratic, so the number 9786 // of digits is doubled after each iteration. In ARMv8, the accuracy of 9787 // the initial estimate is 2^-8. Thus the number of extra steps to refine 9788 // the result for float (23 mantissa bits) is 2 and for double (52 9789 // mantissa bits) is 3. 9790 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; 9791 9792 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 9793 } 9794 9795 return SDValue(); 9796 } 9797 9798 SDValue 9799 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, 9800 const DenormalMode &Mode) const { 9801 SDLoc DL(Op); 9802 EVT VT = Op.getValueType(); 9803 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 9804 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 9805 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); 9806 } 9807 9808 SDValue 9809 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, 9810 SelectionDAG &DAG) const { 9811 return Op; 9812 } 9813 9814 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, 9815 SelectionDAG &DAG, int Enabled, 9816 int &ExtraSteps, 9817 bool &UseOneConst, 9818 bool Reciprocal) const { 9819 if (Enabled == ReciprocalEstimate::Enabled || 9820 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) 9821 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, 9822 DAG, ExtraSteps)) { 9823 SDLoc DL(Operand); 9824 EVT VT = Operand.getValueType(); 9825 9826 SDNodeFlags Flags; 9827 Flags.setAllowReassociation(true); 9828 9829 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) 9830 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) 9831 for (int i = ExtraSteps; i > 0; --i) { 9832 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, 9833 Flags); 9834 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); 9835 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 9836 } 9837 if (!Reciprocal) 9838 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); 9839 9840 ExtraSteps = 0; 9841 return Estimate; 9842 } 9843 9844 return SDValue(); 9845 } 9846 9847 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 9848 SelectionDAG &DAG, int Enabled, 9849 int &ExtraSteps) const { 9850 if (Enabled == ReciprocalEstimate::Enabled) 9851 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, 9852 DAG, ExtraSteps)) { 9853 SDLoc DL(Operand); 9854 EVT VT = Operand.getValueType(); 9855 9856 SDNodeFlags Flags; 9857 Flags.setAllowReassociation(true); 9858 9859 // Newton reciprocal iteration: E * (2 - X * E) 
9860 // AArch64 reciprocal iteration instruction: (2 - M * N) 9861 for (int i = ExtraSteps; i > 0; --i) { 9862 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, 9863 Estimate, Flags); 9864 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 9865 } 9866 9867 ExtraSteps = 0; 9868 return Estimate; 9869 } 9870 9871 return SDValue(); 9872 } 9873 9874 //===----------------------------------------------------------------------===// 9875 // AArch64 Inline Assembly Support 9876 //===----------------------------------------------------------------------===// 9877 9878 // Table of Constraints 9879 // TODO: This is the current set of constraints supported by ARM for the 9880 // compiler, not all of them may make sense. 9881 // 9882 // r - A general register 9883 // w - An FP/SIMD register of some size in the range v0-v31 9884 // x - An FP/SIMD register of some size in the range v0-v15 9885 // I - Constant that can be used with an ADD instruction 9886 // J - Constant that can be used with a SUB instruction 9887 // K - Constant that can be used with a 32-bit logical instruction 9888 // L - Constant that can be used with a 64-bit logical instruction 9889 // M - Constant that can be used as a 32-bit MOV immediate 9890 // N - Constant that can be used as a 64-bit MOV immediate 9891 // Q - A memory reference with base register and no offset 9892 // S - A symbolic address 9893 // Y - Floating point constant zero 9894 // Z - Integer constant zero 9895 // 9896 // Note that general register operands will be output using their 64-bit x 9897 // register name, whatever the size of the variable, unless the asm operand 9898 // is prefixed by the %w modifier. Floating-point and SIMD register operands 9899 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 9900 // %q modifier. 9901 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 9902 // At this point, we have to lower this constraint to something else, so we 9903 // lower it to an "r" or "w". However, by doing this we will force the result 9904 // to be in register, while the X constraint is much more permissive. 9905 // 9906 // Although we are correct (we are free to emit anything, without 9907 // constraints), we might break use cases that would expect us to be more 9908 // efficient and emit something else. 9909 if (!Subtarget->hasFPARMv8()) 9910 return "r"; 9911 9912 if (ConstraintVT.isFloatingPoint()) 9913 return "w"; 9914 9915 if (ConstraintVT.isVector() && 9916 (ConstraintVT.getSizeInBits() == 64 || 9917 ConstraintVT.getSizeInBits() == 128)) 9918 return "w"; 9919 9920 return "r"; 9921 } 9922 9923 enum PredicateConstraint { 9924 Upl, 9925 Upa, 9926 Invalid 9927 }; 9928 9929 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { 9930 PredicateConstraint P = PredicateConstraint::Invalid; 9931 if (Constraint == "Upa") 9932 P = PredicateConstraint::Upa; 9933 if (Constraint == "Upl") 9934 P = PredicateConstraint::Upl; 9935 return P; 9936 } 9937 9938 /// getConstraintType - Given a constraint letter, return the type of 9939 /// constraint it is for this target. 9940 AArch64TargetLowering::ConstraintType 9941 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 9942 if (Constraint.size() == 1) { 9943 switch (Constraint[0]) { 9944 default: 9945 break; 9946 case 'x': 9947 case 'w': 9948 case 'y': 9949 return C_RegisterClass; 9950 // An address with a single base register. Due to the way we 9951 // currently handle addresses it is the same as 'r'. 
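    // Illustrative use: asm volatile("ldxr %0, %1" : "=r"(Val) : "Q"(*Ptr));
    // LDXR accepts only a base register with no offset, which is exactly what
    // 'Q' guarantees (Val and Ptr are placeholder names).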
9952 case 'Q': 9953 return C_Memory; 9954 case 'I': 9955 case 'J': 9956 case 'K': 9957 case 'L': 9958 case 'M': 9959 case 'N': 9960 case 'Y': 9961 case 'Z': 9962 return C_Immediate; 9963 case 'z': 9964 case 'S': // A symbolic address 9965 return C_Other; 9966 } 9967 } else if (parsePredicateConstraint(Constraint) != 9968 PredicateConstraint::Invalid) 9969 return C_RegisterClass; 9970 return TargetLowering::getConstraintType(Constraint); 9971 } 9972 9973 /// Examine constraint type and operand type and determine a weight value. 9974 /// This object must already have been set up with the operand type 9975 /// and the current alternative constraint selected. 9976 TargetLowering::ConstraintWeight 9977 AArch64TargetLowering::getSingleConstraintMatchWeight( 9978 AsmOperandInfo &info, const char *constraint) const { 9979 ConstraintWeight weight = CW_Invalid; 9980 Value *CallOperandVal = info.CallOperandVal; 9981 // If we don't have a value, we can't do a match, 9982 // but allow it at the lowest weight. 9983 if (!CallOperandVal) 9984 return CW_Default; 9985 Type *type = CallOperandVal->getType(); 9986 // Look at the constraint type. 9987 switch (*constraint) { 9988 default: 9989 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 9990 break; 9991 case 'x': 9992 case 'w': 9993 case 'y': 9994 if (type->isFloatingPointTy() || type->isVectorTy()) 9995 weight = CW_Register; 9996 break; 9997 case 'z': 9998 weight = CW_Constant; 9999 break; 10000 case 'U': 10001 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) 10002 weight = CW_Register; 10003 break; 10004 } 10005 return weight; 10006 } 10007 10008 std::pair<unsigned, const TargetRegisterClass *> 10009 AArch64TargetLowering::getRegForInlineAsmConstraint( 10010 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 10011 if (Constraint.size() == 1) { 10012 switch (Constraint[0]) { 10013 case 'r': 10014 if (VT.isScalableVector()) 10015 return std::make_pair(0U, nullptr); 10016 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) 10017 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); 10018 if (VT.getFixedSizeInBits() == 64) 10019 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 10020 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 10021 case 'w': { 10022 if (!Subtarget->hasFPARMv8()) 10023 break; 10024 if (VT.isScalableVector()) { 10025 if (VT.getVectorElementType() != MVT::i1) 10026 return std::make_pair(0U, &AArch64::ZPRRegClass); 10027 return std::make_pair(0U, nullptr); 10028 } 10029 uint64_t VTSize = VT.getFixedSizeInBits(); 10030 if (VTSize == 16) 10031 return std::make_pair(0U, &AArch64::FPR16RegClass); 10032 if (VTSize == 32) 10033 return std::make_pair(0U, &AArch64::FPR32RegClass); 10034 if (VTSize == 64) 10035 return std::make_pair(0U, &AArch64::FPR64RegClass); 10036 if (VTSize == 128) 10037 return std::make_pair(0U, &AArch64::FPR128RegClass); 10038 break; 10039 } 10040 // The instructions that this constraint is designed for can 10041 // only take 128-bit registers so just use that regclass. 
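    // (For example, the by-element forms of some multiplies encode the vector
    // register in only 4 bits, so 'x' is restricted to v0-v15, i.e. the
    // FPR128_lo / ZPR_4b classes used below.)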
10042 case 'x': 10043 if (!Subtarget->hasFPARMv8()) 10044 break; 10045 if (VT.isScalableVector()) 10046 return std::make_pair(0U, &AArch64::ZPR_4bRegClass); 10047 if (VT.getSizeInBits() == 128) 10048 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 10049 break; 10050 case 'y': 10051 if (!Subtarget->hasFPARMv8()) 10052 break; 10053 if (VT.isScalableVector()) 10054 return std::make_pair(0U, &AArch64::ZPR_3bRegClass); 10055 break; 10056 } 10057 } else { 10058 PredicateConstraint PC = parsePredicateConstraint(Constraint); 10059 if (PC != PredicateConstraint::Invalid) { 10060 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) 10061 return std::make_pair(0U, nullptr); 10062 bool restricted = (PC == PredicateConstraint::Upl); 10063 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) 10064 : std::make_pair(0U, &AArch64::PPRRegClass); 10065 } 10066 } 10067 if (StringRef("{cc}").equals_insensitive(Constraint)) 10068 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 10069 10070 // Use the default implementation in TargetLowering to convert the register 10071 // constraint into a member of a register class. 10072 std::pair<unsigned, const TargetRegisterClass *> Res; 10073 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 10074 10075 // Not found as a standard register? 10076 if (!Res.second) { 10077 unsigned Size = Constraint.size(); 10078 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 10079 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 10080 int RegNo; 10081 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 10082 if (!Failed && RegNo >= 0 && RegNo <= 31) { 10083 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 10084 // By default we'll emit v0-v31 for this unless there's a modifier where 10085 // we'll emit the correct register as well. 10086 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 10087 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 10088 Res.second = &AArch64::FPR64RegClass; 10089 } else { 10090 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 10091 Res.second = &AArch64::FPR128RegClass; 10092 } 10093 } 10094 } 10095 } 10096 10097 if (Res.second && !Subtarget->hasFPARMv8() && 10098 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && 10099 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) 10100 return std::make_pair(0U, nullptr); 10101 10102 return Res; 10103 } 10104 10105 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, 10106 llvm::Type *Ty, 10107 bool AllowUnknown) const { 10108 if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) 10109 return EVT(MVT::i64x8); 10110 10111 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); 10112 } 10113 10114 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10115 /// vector. If it is invalid, don't add anything to Ops. 10116 void AArch64TargetLowering::LowerAsmOperandForConstraint( 10117 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 10118 SelectionDAG &DAG) const { 10119 SDValue Result; 10120 10121 // Currently only support length 1 constraints. 10122 if (Constraint.length() != 1) 10123 return; 10124 10125 char ConstraintLetter = Constraint[0]; 10126 switch (ConstraintLetter) { 10127 default: 10128 break; 10129 10130 // This set of constraints deal with valid constants for various instructions. 10131 // Validate and return a target constant for them if we can. 
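  // For instance, asm("add %0, %1, %2" : "=r"(Res) : "r"(A), "I"(4095)) is
  // accepted because 4095 fits the ADD-immediate range checked under 'I'
  // below (Res and A are placeholder operands).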
10132 case 'z': { 10133 // 'z' maps to xzr or wzr so it needs an input of 0. 10134 if (!isNullConstant(Op)) 10135 return; 10136 10137 if (Op.getValueType() == MVT::i64) 10138 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 10139 else 10140 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 10141 break; 10142 } 10143 case 'S': { 10144 // An absolute symbolic address or label reference. 10145 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 10146 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 10147 GA->getValueType(0)); 10148 } else if (const BlockAddressSDNode *BA = 10149 dyn_cast<BlockAddressSDNode>(Op)) { 10150 Result = 10151 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); 10152 } else 10153 return; 10154 break; 10155 } 10156 10157 case 'I': 10158 case 'J': 10159 case 'K': 10160 case 'L': 10161 case 'M': 10162 case 'N': 10163 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 10164 if (!C) 10165 return; 10166 10167 // Grab the value and do some validation. 10168 uint64_t CVal = C->getZExtValue(); 10169 switch (ConstraintLetter) { 10170 // The I constraint applies only to simple ADD or SUB immediate operands: 10171 // i.e. 0 to 4095 with optional shift by 12 10172 // The J constraint applies only to ADD or SUB immediates that would be 10173 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 10174 // instruction [or vice versa], in other words -1 to -4095 with optional 10175 // left shift by 12. 10176 case 'I': 10177 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 10178 break; 10179 return; 10180 case 'J': { 10181 uint64_t NVal = -C->getSExtValue(); 10182 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 10183 CVal = C->getSExtValue(); 10184 break; 10185 } 10186 return; 10187 } 10188 // The K and L constraints apply *only* to logical immediates, including 10189 // what used to be the MOVI alias for ORR (though the MOVI alias has now 10190 // been removed and MOV should be used). So these constraints have to 10191 // distinguish between bit patterns that are valid 32-bit or 64-bit 10192 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 10193 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 10194 // versa. 10195 case 'K': 10196 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 10197 break; 10198 return; 10199 case 'L': 10200 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 10201 break; 10202 return; 10203 // The M and N constraints are a superset of K and L respectively, for use 10204 // with the MOV (immediate) alias. As well as the logical immediates they 10205 // also match 32 or 64-bit immediates that can be loaded either using a 10206 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 10207 // (M) or 64-bit 0x1234000000000000 (N) etc. 10208 // As a note some of this code is liberally stolen from the asm parser. 
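// (For instance, 0xffffedca above is accepted for 'M' because its bitwise NOT
// is 0x00001235, i.e. a single 32-bit MOVN.)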
10209 case 'M': { 10210 if (!isUInt<32>(CVal)) 10211 return; 10212 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 10213 break; 10214 if ((CVal & 0xFFFF) == CVal) 10215 break; 10216 if ((CVal & 0xFFFF0000ULL) == CVal) 10217 break; 10218 uint64_t NCVal = ~(uint32_t)CVal; 10219 if ((NCVal & 0xFFFFULL) == NCVal) 10220 break; 10221 if ((NCVal & 0xFFFF0000ULL) == NCVal) 10222 break; 10223 return; 10224 } 10225 case 'N': { 10226 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 10227 break; 10228 if ((CVal & 0xFFFFULL) == CVal) 10229 break; 10230 if ((CVal & 0xFFFF0000ULL) == CVal) 10231 break; 10232 if ((CVal & 0xFFFF00000000ULL) == CVal) 10233 break; 10234 if ((CVal & 0xFFFF000000000000ULL) == CVal) 10235 break; 10236 uint64_t NCVal = ~CVal; 10237 if ((NCVal & 0xFFFFULL) == NCVal) 10238 break; 10239 if ((NCVal & 0xFFFF0000ULL) == NCVal) 10240 break; 10241 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 10242 break; 10243 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 10244 break; 10245 return; 10246 } 10247 default: 10248 return; 10249 } 10250 10251 // All assembler immediates are 64-bit integers. 10252 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 10253 break; 10254 } 10255 10256 if (Result.getNode()) { 10257 Ops.push_back(Result); 10258 return; 10259 } 10260 10261 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10262 } 10263 10264 //===----------------------------------------------------------------------===// 10265 // AArch64 Advanced SIMD Support 10266 //===----------------------------------------------------------------------===// 10267 10268 /// WidenVector - Given a value in the V64 register class, produce the 10269 /// equivalent value in the V128 register class. 10270 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 10271 EVT VT = V64Reg.getValueType(); 10272 unsigned NarrowSize = VT.getVectorNumElements(); 10273 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 10274 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 10275 SDLoc DL(V64Reg); 10276 10277 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 10278 V64Reg, DAG.getConstant(0, DL, MVT::i64)); 10279 } 10280 10281 /// getExtFactor - Determine the adjustment factor for the position when 10282 /// generating an "extract from vector registers" instruction. 10283 static unsigned getExtFactor(SDValue &V) { 10284 EVT EltType = V.getValueType().getVectorElementType(); 10285 return EltType.getSizeInBits() / 8; 10286 } 10287 10288 /// NarrowVector - Given a value in the V128 register class, produce the 10289 /// equivalent value in the V64 register class. 10290 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 10291 EVT VT = V128Reg.getValueType(); 10292 unsigned WideSize = VT.getVectorNumElements(); 10293 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 10294 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 10295 SDLoc DL(V128Reg); 10296 10297 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 10298 } 10299 10300 // Gather data to see if the operation can be modelled as a 10301 // shuffle in combination with VEXTs. 
10302 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 10303 SelectionDAG &DAG) const { 10304 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 10305 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); 10306 SDLoc dl(Op); 10307 EVT VT = Op.getValueType(); 10308 assert(!VT.isScalableVector() && 10309 "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); 10310 unsigned NumElts = VT.getVectorNumElements(); 10311 10312 struct ShuffleSourceInfo { 10313 SDValue Vec; 10314 unsigned MinElt; 10315 unsigned MaxElt; 10316 10317 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 10318 // be compatible with the shuffle we intend to construct. As a result 10319 // ShuffleVec will be some sliding window into the original Vec. 10320 SDValue ShuffleVec; 10321 10322 // Code should guarantee that element i in Vec starts at element "WindowBase 10323 // + i * WindowScale in ShuffleVec". 10324 int WindowBase; 10325 int WindowScale; 10326 10327 ShuffleSourceInfo(SDValue Vec) 10328 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), 10329 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} 10330 10331 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 10332 }; 10333 10334 // First gather all vectors used as an immediate source for this BUILD_VECTOR 10335 // node. 10336 SmallVector<ShuffleSourceInfo, 2> Sources; 10337 for (unsigned i = 0; i < NumElts; ++i) { 10338 SDValue V = Op.getOperand(i); 10339 if (V.isUndef()) 10340 continue; 10341 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 10342 !isa<ConstantSDNode>(V.getOperand(1)) || 10343 V.getOperand(0).getValueType().isScalableVector()) { 10344 LLVM_DEBUG( 10345 dbgs() << "Reshuffle failed: " 10346 "a shuffle can only come from building a vector from " 10347 "various elements of other fixed-width vectors, provided " 10348 "their indices are constant\n"); 10349 return SDValue(); 10350 } 10351 10352 // Add this element source to the list if it's not already there. 10353 SDValue SourceVec = V.getOperand(0); 10354 auto Source = find(Sources, SourceVec); 10355 if (Source == Sources.end()) 10356 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 10357 10358 // Update the minimum and maximum lane number seen. 10359 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 10360 Source->MinElt = std::min(Source->MinElt, EltNo); 10361 Source->MaxElt = std::max(Source->MaxElt, EltNo); 10362 } 10363 10364 // If we have 3 or 4 sources, try to generate a TBL, which will at least be 10365 // better than moving to/from gpr registers for larger vectors. 10366 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { 10367 // Construct a mask for the tbl. We may need to adjust the index for types 10368 // larger than i8. 10369 SmallVector<unsigned, 16> Mask; 10370 unsigned OutputFactor = VT.getScalarSizeInBits() / 8; 10371 for (unsigned I = 0; I < NumElts; ++I) { 10372 SDValue V = Op.getOperand(I); 10373 if (V.isUndef()) { 10374 for (unsigned OF = 0; OF < OutputFactor; OF++) 10375 Mask.push_back(-1); 10376 continue; 10377 } 10378 // Set the Mask lanes adjusted for the size of the input and output 10379 // lanes. The Mask is always i8, so it will set OutputFactor lanes per 10380 // output element, adjusted in their positions per input and output types. 
10381 unsigned Lane = V.getConstantOperandVal(1); 10382 for (unsigned S = 0; S < Sources.size(); S++) { 10383 if (V.getOperand(0) == Sources[S].Vec) { 10384 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); 10385 unsigned InputBase = 16 * S + Lane * InputSize / 8; 10386 for (unsigned OF = 0; OF < OutputFactor; OF++) 10387 Mask.push_back(InputBase + OF); 10388 break; 10389 } 10390 } 10391 } 10392 10393 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to 10394 // v16i8, and the TBLMask 10395 SmallVector<SDValue, 16> TBLOperands; 10396 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3 10397 ? Intrinsic::aarch64_neon_tbl3 10398 : Intrinsic::aarch64_neon_tbl4, 10399 dl, MVT::i32)); 10400 for (unsigned i = 0; i < Sources.size(); i++) { 10401 SDValue Src = Sources[i].Vec; 10402 EVT SrcVT = Src.getValueType(); 10403 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src); 10404 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && 10405 "Expected a legally typed vector"); 10406 if (SrcVT.is64BitVector()) 10407 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, 10408 DAG.getUNDEF(MVT::v8i8)); 10409 TBLOperands.push_back(Src); 10410 } 10411 10412 SmallVector<SDValue, 16> TBLMask; 10413 for (unsigned i = 0; i < Mask.size(); i++) 10414 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); 10415 assert((Mask.size() == 8 || Mask.size() == 16) && 10416 "Expected a v8i8 or v16i8 Mask"); 10417 TBLOperands.push_back( 10418 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); 10419 10420 SDValue Shuffle = 10421 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 10422 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); 10423 return DAG.getBitcast(VT, Shuffle); 10424 } 10425 10426 if (Sources.size() > 2) { 10427 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " 10428 << "sensible when at most two source vectors are " 10429 << "involved\n"); 10430 return SDValue(); 10431 } 10432 10433 // Find out the smallest element size among result and two sources, and use 10434 // it as element size to build the shuffle_vector. 10435 EVT SmallestEltTy = VT.getVectorElementType(); 10436 for (auto &Source : Sources) { 10437 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 10438 if (SrcEltTy.bitsLT(SmallestEltTy)) { 10439 SmallestEltTy = SrcEltTy; 10440 } 10441 } 10442 unsigned ResMultiplier = 10443 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); 10444 uint64_t VTSize = VT.getFixedSizeInBits(); 10445 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); 10446 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 10447 10448 // If the source vector is too wide or too narrow, we may nevertheless be able 10449 // to construct a compatible shuffle either by concatenating it with UNDEF or 10450 // extracting a suitable range of elements. 10451 for (auto &Src : Sources) { 10452 EVT SrcVT = Src.ShuffleVec.getValueType(); 10453 10454 TypeSize SrcVTSize = SrcVT.getSizeInBits(); 10455 if (SrcVTSize == TypeSize::Fixed(VTSize)) 10456 continue; 10457 10458 // This stage of the search produces a source with the same element type as 10459 // the original, but with a total width matching the BUILD_VECTOR output. 
10460 EVT EltVT = SrcVT.getVectorElementType(); 10461 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 10462 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 10463 10464 if (SrcVTSize.getFixedValue() < VTSize) { 10465 assert(2 * SrcVTSize == VTSize); 10466 // We can pad out the smaller vector for free, so if it's part of a 10467 // shuffle... 10468 Src.ShuffleVec = 10469 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 10470 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 10471 continue; 10472 } 10473 10474 if (SrcVTSize.getFixedValue() != 2 * VTSize) { 10475 LLVM_DEBUG( 10476 dbgs() << "Reshuffle failed: result vector too small to extract\n"); 10477 return SDValue(); 10478 } 10479 10480 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 10481 LLVM_DEBUG( 10482 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); 10483 return SDValue(); 10484 } 10485 10486 if (Src.MinElt >= NumSrcElts) { 10487 // The extraction can just take the second half 10488 Src.ShuffleVec = 10489 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 10490 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 10491 Src.WindowBase = -NumSrcElts; 10492 } else if (Src.MaxElt < NumSrcElts) { 10493 // The extraction can just take the first half 10494 Src.ShuffleVec = 10495 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 10496 DAG.getConstant(0, dl, MVT::i64)); 10497 } else { 10498 // An actual VEXT is needed 10499 SDValue VEXTSrc1 = 10500 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 10501 DAG.getConstant(0, dl, MVT::i64)); 10502 SDValue VEXTSrc2 = 10503 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 10504 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 10505 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 10506 10507 if (!SrcVT.is64BitVector()) { 10508 LLVM_DEBUG( 10509 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " 10510 "for SVE vectors."); 10511 return SDValue(); 10512 } 10513 10514 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 10515 VEXTSrc2, 10516 DAG.getConstant(Imm, dl, MVT::i32)); 10517 Src.WindowBase = -Src.MinElt; 10518 } 10519 } 10520 10521 // Another possible incompatibility occurs from the vector element types. We 10522 // can fix this by bitcasting the source vectors to the same type we intend 10523 // for the shuffle. 10524 for (auto &Src : Sources) { 10525 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 10526 if (SrcEltTy == SmallestEltTy) 10527 continue; 10528 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 10529 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 10530 Src.WindowScale = 10531 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); 10532 Src.WindowBase *= Src.WindowScale; 10533 } 10534 10535 // Final check before we try to actually produce a shuffle. 10536 LLVM_DEBUG(for (auto Src 10537 : Sources) 10538 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 10539 10540 // The stars all align, our next step is to produce the mask for the shuffle. 
10541 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 10542 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 10543 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 10544 SDValue Entry = Op.getOperand(i); 10545 if (Entry.isUndef()) 10546 continue; 10547 10548 auto Src = find(Sources, Entry.getOperand(0)); 10549 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 10550 10551 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 10552 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 10553 // segment. 10554 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 10555 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 10556 VT.getScalarSizeInBits()); 10557 int LanesDefined = BitsDefined / BitsPerShuffleLane; 10558 10559 // This source is expected to fill ResMultiplier lanes of the final shuffle, 10560 // starting at the appropriate offset. 10561 int *LaneMask = &Mask[i * ResMultiplier]; 10562 10563 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 10564 ExtractBase += NumElts * (Src - Sources.begin()); 10565 for (int j = 0; j < LanesDefined; ++j) 10566 LaneMask[j] = ExtractBase + j; 10567 } 10568 10569 // Final check before we try to produce nonsense... 10570 if (!isShuffleMaskLegal(Mask, ShuffleVT)) { 10571 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); 10572 return SDValue(); 10573 } 10574 10575 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 10576 for (unsigned i = 0; i < Sources.size(); ++i) 10577 ShuffleOps[i] = Sources[i].ShuffleVec; 10578 10579 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 10580 ShuffleOps[1], Mask); 10581 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 10582 10583 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); 10584 dbgs() << "Reshuffle, creating node: "; V.dump();); 10585 10586 return V; 10587 } 10588 10589 // check if an EXT instruction can handle the shuffle mask when the 10590 // vector sources of the shuffle are the same. 10591 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 10592 unsigned NumElts = VT.getVectorNumElements(); 10593 10594 // Assume that the first shuffle index is not UNDEF. Fail if it is. 10595 if (M[0] < 0) 10596 return false; 10597 10598 Imm = M[0]; 10599 10600 // If this is a VEXT shuffle, the immediate value is the index of the first 10601 // element. The other shuffle indices must be the successive elements after 10602 // the first one. 10603 unsigned ExpectedElt = Imm; 10604 for (unsigned i = 1; i < NumElts; ++i) { 10605 // Increment the expected index. If it wraps around, just follow it 10606 // back to index zero and keep going. 10607 ++ExpectedElt; 10608 if (ExpectedElt == NumElts) 10609 ExpectedElt = 0; 10610 10611 if (M[i] < 0) 10612 continue; // ignore UNDEF indices 10613 if (ExpectedElt != static_cast<unsigned>(M[i])) 10614 return false; 10615 } 10616 10617 return true; 10618 } 10619 10620 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from 10621 // v4i32s. This is really a truncate, which we can construct out of (legal) 10622 // concats and truncate nodes. 
10623 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
10624 if (V.getValueType() != MVT::v16i8)
10625 return SDValue();
10626 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
10627 
10628 for (unsigned X = 0; X < 4; X++) {
10629 // Check the first item in each group is an extract from lane 0 of a v4i32
10630 // or v4i16.
10631 SDValue BaseExt = V.getOperand(X * 4);
10632 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10633 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
10634 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
10635 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
10636 BaseExt.getConstantOperandVal(1) != 0)
10637 return SDValue();
10638 SDValue Base = BaseExt.getOperand(0);
10639 // And check the other items are extracts from the same vector.
10640 for (unsigned Y = 1; Y < 4; Y++) {
10641 SDValue Ext = V.getOperand(X * 4 + Y);
10642 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10643 Ext.getOperand(0) != Base ||
10644 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
10645 Ext.getConstantOperandVal(1) != Y)
10646 return SDValue();
10647 }
10648 }
10649 
10650 // Turn the buildvector into a series of truncates and concats, which will
10651 // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
10652 // concatenated together to produce 2 v8i16s. These are both truncated and
10653 // concatenated together.
10654 SDLoc DL(V);
10655 SDValue Trunc[4] = {
10656 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
10657 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
10658 for (SDValue &V : Trunc)
10659 if (V.getValueType() == MVT::v4i32)
10660 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
10661 SDValue Concat0 =
10662 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
10663 SDValue Concat1 =
10664 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
10665 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
10666 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
10667 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
10668 }
10669 
10670 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
10671 /// element width than the vector lane type. If that is the case the function
10672 /// returns true and writes the value of the DUP instruction lane operand into
10673 /// DupLaneOp.
10674 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
10675 unsigned &DupLaneOp) {
10676 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10677 "Only possible block sizes for wide DUP are: 16, 32, 64");
10678 
10679 if (BlockSize <= VT.getScalarSizeInBits())
10680 return false;
10681 if (BlockSize % VT.getScalarSizeInBits() != 0)
10682 return false;
10683 if (VT.getSizeInBits() % BlockSize != 0)
10684 return false;
10685 
10686 size_t SingleVecNumElements = VT.getVectorNumElements();
10687 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
10688 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
10689 
10690 // We are looking for masks like
10691 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
10692 // might be replaced by 'undefined'. BlockIndices will eventually contain
10693 // lane indices of the duplicated block (i.e.
[0, 1], [2, 3] and [4, 5, 6, 7]
10694 // for the above examples)
10695 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
10696 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
10697 for (size_t I = 0; I < NumEltsPerBlock; I++) {
10698 int Elt = M[BlockIndex * NumEltsPerBlock + I];
10699 if (Elt < 0)
10700 continue;
10701 // For now we don't support shuffles that use the second operand
10702 if ((unsigned)Elt >= SingleVecNumElements)
10703 return false;
10704 if (BlockElts[I] < 0)
10705 BlockElts[I] = Elt;
10706 else if (BlockElts[I] != Elt)
10707 return false;
10708 }
10709 
10710 // We found a candidate block (possibly with some undefs). It must be a
10711 // sequence of consecutive integers starting with a value divisible by
10712 // NumEltsPerBlock with some values possibly replaced by undef-s.
10713 
10714 // Find first non-undef element
10715 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
10716 assert(FirstRealEltIter != BlockElts.end() &&
10717 "Shuffle with all-undefs must have been caught by previous cases, "
10718 "e.g. isSplat()");
10719 if (FirstRealEltIter == BlockElts.end()) {
10720 DupLaneOp = 0;
10721 return true;
10722 }
10723 
10724 // Index of FirstRealElt in BlockElts
10725 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
10726 
10727 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
10728 return false;
10729 // BlockElts[0] must have the following value if it isn't undef:
10730 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
10731 
10732 // Check the first element
10733 if (Elt0 % NumEltsPerBlock != 0)
10734 return false;
10735 // Check that the sequence indeed consists of consecutive integers (modulo
10736 // undefs)
10737 for (size_t I = 0; I < NumEltsPerBlock; I++)
10738 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
10739 return false;
10740 
10741 DupLaneOp = Elt0 / NumEltsPerBlock;
10742 return true;
10743 }
10744 
10745 // check if an EXT instruction can handle the shuffle mask when the
10746 // vector sources of the shuffle are different.
10747 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
10748 unsigned &Imm) {
10749 // Look for the first non-undef element.
10750 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
10751 
10752 // Benefit from APInt to handle overflow when calculating the expected element.
10753 unsigned NumElts = VT.getVectorNumElements();
10754 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
10755 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
10756 // The following shuffle indices must be the successive elements after the
10757 // first real element.
10758 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
10759 return Elt != ExpectedElt++ && Elt != -1;
10760 });
10761 if (FoundWrongElt)
10762 return false;
10763 
10764 // The index of an EXT is the first element if it is not UNDEF.
10765 // Watch out for the beginning UNDEFs. The EXT index should be the expected
10766 // value of the first element. E.g.
10767 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
10768 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
10769 // ExpectedElt is the last mask index plus 1.
10770 Imm = ExpectedElt.getZExtValue();
10771 
10772 // There are two different cases that require reversing the input vectors.
10773 // For example, for vector <4 x i32> we have the following cases, 10774 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) 10775 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) 10776 // For both cases, we finally use mask <5, 6, 7, 0>, which requires 10777 // to reverse two input vectors. 10778 if (Imm < NumElts) 10779 ReverseEXT = true; 10780 else 10781 Imm -= NumElts; 10782 10783 return true; 10784 } 10785 10786 /// isREVMask - Check if a vector shuffle corresponds to a REV 10787 /// instruction with the specified blocksize. (The order of the elements 10788 /// within each block of the vector is reversed.) 10789 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 10790 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 || 10791 BlockSize == 128) && 10792 "Only possible block sizes for REV are: 16, 32, 64, 128"); 10793 10794 unsigned EltSz = VT.getScalarSizeInBits(); 10795 unsigned NumElts = VT.getVectorNumElements(); 10796 unsigned BlockElts = M[0] + 1; 10797 // If the first shuffle index is UNDEF, be optimistic. 10798 if (M[0] < 0) 10799 BlockElts = BlockSize / EltSz; 10800 10801 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 10802 return false; 10803 10804 for (unsigned i = 0; i < NumElts; ++i) { 10805 if (M[i] < 0) 10806 continue; // ignore UNDEF indices 10807 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 10808 return false; 10809 } 10810 10811 return true; 10812 } 10813 10814 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10815 unsigned NumElts = VT.getVectorNumElements(); 10816 if (NumElts % 2 != 0) 10817 return false; 10818 WhichResult = (M[0] == 0 ? 0 : 1); 10819 unsigned Idx = WhichResult * NumElts / 2; 10820 for (unsigned i = 0; i != NumElts; i += 2) { 10821 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 10822 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 10823 return false; 10824 Idx += 1; 10825 } 10826 10827 return true; 10828 } 10829 10830 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10831 unsigned NumElts = VT.getVectorNumElements(); 10832 WhichResult = (M[0] == 0 ? 0 : 1); 10833 for (unsigned i = 0; i != NumElts; ++i) { 10834 if (M[i] < 0) 10835 continue; // ignore UNDEF indices 10836 if ((unsigned)M[i] != 2 * i + WhichResult) 10837 return false; 10838 } 10839 10840 return true; 10841 } 10842 10843 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10844 unsigned NumElts = VT.getVectorNumElements(); 10845 if (NumElts % 2 != 0) 10846 return false; 10847 WhichResult = (M[0] == 0 ? 0 : 1); 10848 for (unsigned i = 0; i < NumElts; i += 2) { 10849 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 10850 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 10851 return false; 10852 } 10853 return true; 10854 } 10855 10856 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 10857 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 10858 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 10859 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10860 unsigned NumElts = VT.getVectorNumElements(); 10861 if (NumElts % 2 != 0) 10862 return false; 10863 WhichResult = (M[0] == 0 ? 
0 : 1); 10864 unsigned Idx = WhichResult * NumElts / 2; 10865 for (unsigned i = 0; i != NumElts; i += 2) { 10866 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 10867 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 10868 return false; 10869 Idx += 1; 10870 } 10871 10872 return true; 10873 } 10874 10875 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 10876 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 10877 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 10878 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10879 unsigned Half = VT.getVectorNumElements() / 2; 10880 WhichResult = (M[0] == 0 ? 0 : 1); 10881 for (unsigned j = 0; j != 2; ++j) { 10882 unsigned Idx = WhichResult; 10883 for (unsigned i = 0; i != Half; ++i) { 10884 int MIdx = M[i + j * Half]; 10885 if (MIdx >= 0 && (unsigned)MIdx != Idx) 10886 return false; 10887 Idx += 2; 10888 } 10889 } 10890 10891 return true; 10892 } 10893 10894 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 10895 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 10896 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 10897 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 10898 unsigned NumElts = VT.getVectorNumElements(); 10899 if (NumElts % 2 != 0) 10900 return false; 10901 WhichResult = (M[0] == 0 ? 0 : 1); 10902 for (unsigned i = 0; i < NumElts; i += 2) { 10903 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 10904 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 10905 return false; 10906 } 10907 return true; 10908 } 10909 10910 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 10911 bool &DstIsLeft, int &Anomaly) { 10912 if (M.size() != static_cast<size_t>(NumInputElements)) 10913 return false; 10914 10915 int NumLHSMatch = 0, NumRHSMatch = 0; 10916 int LastLHSMismatch = -1, LastRHSMismatch = -1; 10917 10918 for (int i = 0; i < NumInputElements; ++i) { 10919 if (M[i] == -1) { 10920 ++NumLHSMatch; 10921 ++NumRHSMatch; 10922 continue; 10923 } 10924 10925 if (M[i] == i) 10926 ++NumLHSMatch; 10927 else 10928 LastLHSMismatch = i; 10929 10930 if (M[i] == i + NumInputElements) 10931 ++NumRHSMatch; 10932 else 10933 LastRHSMismatch = i; 10934 } 10935 10936 if (NumLHSMatch == NumInputElements - 1) { 10937 DstIsLeft = true; 10938 Anomaly = LastLHSMismatch; 10939 return true; 10940 } else if (NumRHSMatch == NumInputElements - 1) { 10941 DstIsLeft = false; 10942 Anomaly = LastRHSMismatch; 10943 return true; 10944 } 10945 10946 return false; 10947 } 10948 10949 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 10950 if (VT.getSizeInBits() != 128) 10951 return false; 10952 10953 unsigned NumElts = VT.getVectorNumElements(); 10954 10955 for (int I = 0, E = NumElts / 2; I != E; I++) { 10956 if (Mask[I] != I) 10957 return false; 10958 } 10959 10960 int Offset = NumElts / 2; 10961 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 10962 if (Mask[I] != I + SplitLHS * Offset) 10963 return false; 10964 } 10965 10966 return true; 10967 } 10968 10969 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 10970 SDLoc DL(Op); 10971 EVT VT = Op.getValueType(); 10972 SDValue V0 = Op.getOperand(0); 10973 SDValue V1 = Op.getOperand(1); 10974 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 10975 10976 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 10977 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 10978 
return SDValue();
10979 
10980 bool SplitV0 = V0.getValueSizeInBits() == 128;
10981 
10982 if (!isConcatMask(Mask, VT, SplitV0))
10983 return SDValue();
10984 
10985 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
10986 if (SplitV0) {
10987 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
10988 DAG.getConstant(0, DL, MVT::i64));
10989 }
10990 if (V1.getValueSizeInBits() == 128) {
10991 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
10992 DAG.getConstant(0, DL, MVT::i64));
10993 }
10994 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
10995 }
10996 
10997 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10998 /// the specified operations to build the shuffle. ID is the perfect-shuffle
10999 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
11000 /// table entry and LHS/RHS are the immediate inputs for this stage of the
11001 /// shuffle.
11002 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11003 SDValue V2, unsigned PFEntry, SDValue LHS,
11004 SDValue RHS, SelectionDAG &DAG,
11005 const SDLoc &dl) {
11006 unsigned OpNum = (PFEntry >> 26) & 0x0F;
11007 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11008 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
11009 
11010 enum {
11011 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11012 OP_VREV,
11013 OP_VDUP0,
11014 OP_VDUP1,
11015 OP_VDUP2,
11016 OP_VDUP3,
11017 OP_VEXT1,
11018 OP_VEXT2,
11019 OP_VEXT3,
11020 OP_VUZPL, // VUZP, left result
11021 OP_VUZPR, // VUZP, right result
11022 OP_VZIPL, // VZIP, left result
11023 OP_VZIPR, // VZIP, right result
11024 OP_VTRNL, // VTRN, left result
11025 OP_VTRNR, // VTRN, right result
11026 OP_MOVLANE // Move lane. RHSID is the lane to move into
11027 };
11028 
11029 if (OpNum == OP_COPY) {
11030 if (LHSID == (1 * 9 + 2) * 9 + 3)
11031 return LHS;
11032 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
11033 return RHS;
11034 }
11035 
11036 if (OpNum == OP_MOVLANE) {
11037 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
11038 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
11039 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
11040 Elt = 3 - Elt;
11041 while (Elt > 0) {
11042 ID /= 9;
11043 Elt--;
11044 }
11045 return (ID % 9 == 8) ? -1 : ID % 9;
11046 };
11047 
11048 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
11049 // lane to move from is taken from the PFID, which always refers to the
11050 // original vectors (V1 or V2).
11051 SDValue OpLHS = GeneratePerfectShuffle(
11052 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
11053 EVT VT = OpLHS.getValueType();
11054 assert(RHSID < 8 && "Expected a lane index for RHSID!");
11055 unsigned ExtLane = 0;
11056 SDValue Input;
11057 
11058 // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs. D movs
11059 // convert into a higher type.
11060 if (RHSID & 0x4) {
11061 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
11062 if (MaskElt == -1)
11063 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
11064 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11065 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
11066 Input = MaskElt < 2 ?
V1 : V2;
11067 if (VT.getScalarSizeInBits() == 16) {
11068 Input = DAG.getBitcast(MVT::v2f32, Input);
11069 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
11070 } else {
11071 assert(VT.getScalarSizeInBits() == 32 &&
11072 "Expected 16 or 32 bit shuffle elements");
11073 Input = DAG.getBitcast(MVT::v2f64, Input);
11074 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
11075 }
11076 } else {
11077 int MaskElt = getPFIDLane(ID, RHSID);
11078 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11079 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
11080 Input = MaskElt < 4 ? V1 : V2;
11081 // Be careful about creating illegal types. Use f16 instead of i16.
11082 if (VT == MVT::v4i16) {
11083 Input = DAG.getBitcast(MVT::v4f16, Input);
11084 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
11085 }
11086 }
11087 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
11088 Input.getValueType().getVectorElementType(),
11089 Input, DAG.getVectorIdxConstant(ExtLane, dl));
11090 SDValue Ins =
11091 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
11092 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
11093 return DAG.getBitcast(VT, Ins);
11094 }
11095 
11096 SDValue OpLHS, OpRHS;
11097 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
11098 RHS, DAG, dl);
11099 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
11100 RHS, DAG, dl);
11101 EVT VT = OpLHS.getValueType();
11102 
11103 switch (OpNum) {
11104 default:
11105 llvm_unreachable("Unknown shuffle opcode!");
11106 case OP_VREV:
11107 // VREV divides the vector in half and swaps within the half.
11108 if (VT.getVectorElementType() == MVT::i32 ||
11109 VT.getVectorElementType() == MVT::f32)
11110 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
11111 // vrev <4 x i16> -> REV32
11112 if (VT.getVectorElementType() == MVT::i16 ||
11113 VT.getVectorElementType() == MVT::f16 ||
11114 VT.getVectorElementType() == MVT::bf16)
11115 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
11116 // vrev <4 x i8> -> REV16
11117 assert(VT.getVectorElementType() == MVT::i8);
11118 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
11119 case OP_VDUP0:
11120 case OP_VDUP1:
11121 case OP_VDUP2:
11122 case OP_VDUP3: {
11123 EVT EltTy = VT.getVectorElementType();
11124 unsigned Opcode;
11125 if (EltTy == MVT::i8)
11126 Opcode = AArch64ISD::DUPLANE8;
11127 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
11128 Opcode = AArch64ISD::DUPLANE16;
11129 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
11130 Opcode = AArch64ISD::DUPLANE32;
11131 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
11132 Opcode = AArch64ISD::DUPLANE64;
11133 else
11134 llvm_unreachable("Invalid vector element type?");
11135 
11136 if (VT.getSizeInBits() == 64)
11137 OpLHS = WidenVector(OpLHS, DAG);
11138 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
11139 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
11140 }
11141 case OP_VEXT1:
11142 case OP_VEXT2:
11143 case OP_VEXT3: {
11144 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
11145 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
11146 DAG.getConstant(Imm, dl, MVT::i32));
11147 }
11148 case OP_VUZPL:
11149 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
11150 OpRHS);
11151 case OP_VUZPR:
11152 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
11153 OpRHS);
11154 case OP_VZIPL:
11155 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
11156
OpRHS); 11157 case OP_VZIPR: 11158 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 11159 OpRHS); 11160 case OP_VTRNL: 11161 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 11162 OpRHS); 11163 case OP_VTRNR: 11164 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 11165 OpRHS); 11166 } 11167 } 11168 11169 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 11170 SelectionDAG &DAG) { 11171 // Check to see if we can use the TBL instruction. 11172 SDValue V1 = Op.getOperand(0); 11173 SDValue V2 = Op.getOperand(1); 11174 SDLoc DL(Op); 11175 11176 EVT EltVT = Op.getValueType().getVectorElementType(); 11177 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 11178 11179 bool Swap = false; 11180 if (V1.isUndef() || isZerosVector(V1.getNode())) { 11181 std::swap(V1, V2); 11182 Swap = true; 11183 } 11184 11185 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill 11186 // out of range values with 0s. We do need to make sure that any out-of-range 11187 // values are really out-of-range for a v16i8 vector. 11188 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); 11189 MVT IndexVT = MVT::v8i8; 11190 unsigned IndexLen = 8; 11191 if (Op.getValueSizeInBits() == 128) { 11192 IndexVT = MVT::v16i8; 11193 IndexLen = 16; 11194 } 11195 11196 SmallVector<SDValue, 8> TBLMask; 11197 for (int Val : ShuffleMask) { 11198 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 11199 unsigned Offset = Byte + Val * BytesPerElt; 11200 if (Swap) 11201 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; 11202 if (IsUndefOrZero && Offset >= IndexLen) 11203 Offset = 255; 11204 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 11205 } 11206 } 11207 11208 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 11209 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 11210 11211 SDValue Shuffle; 11212 if (IsUndefOrZero) { 11213 if (IndexLen == 8) 11214 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 11215 Shuffle = DAG.getNode( 11216 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 11217 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 11218 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); 11219 } else { 11220 if (IndexLen == 8) { 11221 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 11222 Shuffle = DAG.getNode( 11223 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 11224 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 11225 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); 11226 } else { 11227 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 11228 // cannot currently represent the register constraints on the input 11229 // table registers. 
11230 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 11231 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 11232 // IndexLen)); 11233 Shuffle = DAG.getNode( 11234 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 11235 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 11236 V2Cst, 11237 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); 11238 } 11239 } 11240 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 11241 } 11242 11243 static unsigned getDUPLANEOp(EVT EltType) { 11244 if (EltType == MVT::i8) 11245 return AArch64ISD::DUPLANE8; 11246 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) 11247 return AArch64ISD::DUPLANE16; 11248 if (EltType == MVT::i32 || EltType == MVT::f32) 11249 return AArch64ISD::DUPLANE32; 11250 if (EltType == MVT::i64 || EltType == MVT::f64) 11251 return AArch64ISD::DUPLANE64; 11252 11253 llvm_unreachable("Invalid vector element type?"); 11254 } 11255 11256 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, 11257 unsigned Opcode, SelectionDAG &DAG) { 11258 // Try to eliminate a bitcasted extract subvector before a DUPLANE. 11259 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { 11260 // Match: dup (bitcast (extract_subv X, C)), LaneC 11261 if (BitCast.getOpcode() != ISD::BITCAST || 11262 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) 11263 return false; 11264 11265 // The extract index must align in the destination type. That may not 11266 // happen if the bitcast is from narrow to wide type. 11267 SDValue Extract = BitCast.getOperand(0); 11268 unsigned ExtIdx = Extract.getConstantOperandVal(1); 11269 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); 11270 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; 11271 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); 11272 if (ExtIdxInBits % CastedEltBitWidth != 0) 11273 return false; 11274 11275 // Can't handle cases where vector size is not 128-bit 11276 if (!Extract.getOperand(0).getValueType().is128BitVector()) 11277 return false; 11278 11279 // Update the lane value by offsetting with the scaled extract index. 11280 LaneC += ExtIdxInBits / CastedEltBitWidth; 11281 11282 // Determine the casted vector type of the wide vector input. 11283 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' 11284 // Examples: 11285 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 11286 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 11287 unsigned SrcVecNumElts = 11288 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; 11289 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), 11290 SrcVecNumElts); 11291 return true; 11292 }; 11293 MVT CastVT; 11294 if (getScaledOffsetDup(V, Lane, CastVT)) { 11295 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); 11296 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && 11297 V.getOperand(0).getValueType().is128BitVector()) { 11298 // The lane is incremented by the index of the extract. 11299 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 11300 Lane += V.getConstantOperandVal(1); 11301 V = V.getOperand(0); 11302 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { 11303 // The lane is decremented if we are splatting from the 2nd operand. 
11304 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 11305 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; 11306 Lane -= Idx * VT.getVectorNumElements() / 2; 11307 V = WidenVector(V.getOperand(Idx), DAG); 11308 } else if (VT.getSizeInBits() == 64) { 11309 // Widen the operand to 128-bit register with undef. 11310 V = WidenVector(V, DAG); 11311 } 11312 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); 11313 } 11314 11315 // Return true if we can get a new shuffle mask by checking the parameter mask 11316 // array to test whether every two adjacent mask values are continuous and 11317 // starting from an even number. 11318 static bool isWideTypeMask(ArrayRef<int> M, EVT VT, 11319 SmallVectorImpl<int> &NewMask) { 11320 unsigned NumElts = VT.getVectorNumElements(); 11321 if (NumElts % 2 != 0) 11322 return false; 11323 11324 NewMask.clear(); 11325 for (unsigned i = 0; i < NumElts; i += 2) { 11326 int M0 = M[i]; 11327 int M1 = M[i + 1]; 11328 11329 // If both elements are undef, new mask is undef too. 11330 if (M0 == -1 && M1 == -1) { 11331 NewMask.push_back(-1); 11332 continue; 11333 } 11334 11335 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) { 11336 NewMask.push_back(M1 / 2); 11337 continue; 11338 } 11339 11340 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) { 11341 NewMask.push_back(M0 / 2); 11342 continue; 11343 } 11344 11345 NewMask.clear(); 11346 return false; 11347 } 11348 11349 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!"); 11350 return true; 11351 } 11352 11353 // Try to widen element type to get a new mask value for a better permutation 11354 // sequence, so that we can use NEON shuffle instructions, such as zip1/2, 11355 // UZP1/2, TRN1/2, REV, INS, etc. 11356 // For example: 11357 // shufflevector <4 x i32> %a, <4 x i32> %b, 11358 // <4 x i32> <i32 6, i32 7, i32 2, i32 3> 11359 // is equivalent to: 11360 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> 11361 // Finally, we can get: 11362 // mov v0.d[0], v1.d[1] 11363 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { 11364 SDLoc DL(Op); 11365 EVT VT = Op.getValueType(); 11366 EVT ScalarVT = VT.getVectorElementType(); 11367 unsigned ElementSize = ScalarVT.getFixedSizeInBits(); 11368 SDValue V0 = Op.getOperand(0); 11369 SDValue V1 = Op.getOperand(1); 11370 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 11371 11372 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ... 11373 // We need to make sure the wider element type is legal. Thus, ElementSize 11374 // should be not larger than 32 bits, and i1 type should also be excluded. 11375 if (ElementSize > 32 || ElementSize == 1) 11376 return SDValue(); 11377 11378 SmallVector<int, 8> NewMask; 11379 if (isWideTypeMask(Mask, VT, NewMask)) { 11380 MVT NewEltVT = VT.isFloatingPoint() 11381 ? MVT::getFloatingPointVT(ElementSize * 2) 11382 : MVT::getIntegerVT(ElementSize * 2); 11383 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); 11384 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { 11385 V0 = DAG.getBitcast(NewVT, V0); 11386 V1 = DAG.getBitcast(NewVT, V1); 11387 return DAG.getBitcast(VT, 11388 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask)); 11389 } 11390 } 11391 11392 return SDValue(); 11393 } 11394 11395 // Try to fold shuffle (tbl2, tbl2) into a single tbl4. 
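// Illustrative sketch: shuffle(tbl2(a, b, m1), tbl2(c, d, m2), mask) can become
// tbl4(a, b, c, d, m), where lane I of m is copied from m1 when mask[I] < 16
// and otherwise taken from m2 plus 32 (c and d occupy bytes 32-63 of the
// combined four-register table).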
11396 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, 11397 ArrayRef<int> ShuffleMask, 11398 SelectionDAG &DAG) { 11399 SDValue Tbl1 = Op->getOperand(0); 11400 SDValue Tbl2 = Op->getOperand(1); 11401 SDLoc dl(Op); 11402 SDValue Tbl2ID = 11403 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64); 11404 11405 EVT VT = Op.getValueType(); 11406 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || 11407 Tbl1->getOperand(0) != Tbl2ID || 11408 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || 11409 Tbl2->getOperand(0) != Tbl2ID) 11410 return SDValue(); 11411 11412 if (Tbl1->getValueType(0) != MVT::v16i8 || 11413 Tbl2->getValueType(0) != MVT::v16i8) 11414 return SDValue(); 11415 11416 SDValue Mask1 = Tbl1->getOperand(3); 11417 SDValue Mask2 = Tbl2->getOperand(3); 11418 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue()); 11419 for (unsigned I = 0; I < 16; I++) { 11420 if (ShuffleMask[I] < 16) 11421 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]); 11422 else { 11423 auto *C = 11424 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16)); 11425 if (!C) 11426 return SDValue(); 11427 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32); 11428 } 11429 } 11430 11431 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts); 11432 SDValue ID = 11433 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64); 11434 11435 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8, 11436 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2), 11437 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask}); 11438 } 11439 11440 // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros, 11441 // but we don't have an appropriate instruction, 11442 // so custom-lower it as ZIP1-with-zeros. 11443 SDValue 11444 AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, 11445 SelectionDAG &DAG) const { 11446 SDLoc dl(Op); 11447 EVT VT = Op.getValueType(); 11448 SDValue SrcOp = Op.getOperand(0); 11449 EVT SrcVT = SrcOp.getValueType(); 11450 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 && 11451 "Unexpected extension factor."); 11452 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); 11453 // FIXME: support multi-step zipping? 11454 if (Scale != 2) 11455 return SDValue(); 11456 SDValue Zeros = DAG.getConstant(0, dl, SrcVT); 11457 return DAG.getBitcast(VT, 11458 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros)); 11459 } 11460 11461 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 11462 SelectionDAG &DAG) const { 11463 SDLoc dl(Op); 11464 EVT VT = Op.getValueType(); 11465 11466 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 11467 11468 if (useSVEForFixedLengthVectorVT(VT, 11469 Subtarget->forceStreamingCompatibleSVE())) 11470 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); 11471 11472 // Convert shuffles that are directly supported on NEON to target-specific 11473 // DAG nodes, instead of keeping them as shuffles and matching them again 11474 // during code selection. This is more efficient and avoids the possibility 11475 // of inconsistencies between legalization and selection. 
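// For example, a v4i32 shuffle with mask <0, 4, 1, 5> is emitted directly as
// an AArch64ISD::ZIP1 node below rather than being re-matched later.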
11476 ArrayRef<int> ShuffleMask = SVN->getMask();
11477 
11478 SDValue V1 = Op.getOperand(0);
11479 SDValue V2 = Op.getOperand(1);
11480 
11481 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
11482 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
11483 "Unexpected VECTOR_SHUFFLE mask size!");
11484 
11485 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
11486 return Res;
11487 
11488 if (SVN->isSplat()) {
11489 int Lane = SVN->getSplatIndex();
11490 // If this is undef splat, generate it via "just" vdup, if possible.
11491 if (Lane == -1)
11492 Lane = 0;
11493 
11494 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
11495 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
11496 V1.getOperand(0));
11497 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
11498 // constant. If so, we can just reference the lane's definition directly.
11499 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
11500 !isa<ConstantSDNode>(V1.getOperand(Lane)))
11501 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
11502 
11503 // Otherwise, duplicate from the lane of the input vector.
11504 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
11505 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
11506 }
11507 
11508 // Check if the mask matches a DUP for a wider element
11509 for (unsigned LaneSize : {64U, 32U, 16U}) {
11510 unsigned Lane = 0;
11511 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
11512 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
11513 : LaneSize == 32 ? AArch64ISD::DUPLANE32
11514 : AArch64ISD::DUPLANE16;
11515 // Cast V1 to an integer vector with the required lane size
11516 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
11517 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
11518 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
11519 V1 = DAG.getBitcast(NewVecTy, V1);
11520 // Construct the DUP instruction
11521 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
11522 // Cast back to the original type
11523 return DAG.getBitcast(VT, V1);
11524 }
11525 }
11526 
11527 if (isREVMask(ShuffleMask, VT, 64))
11528 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
11529 if (isREVMask(ShuffleMask, VT, 32))
11530 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
11531 if (isREVMask(ShuffleMask, VT, 16))
11532 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
11533 
11534 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
11535 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
11536 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
11537 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
11538 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
11539 DAG.getConstant(8, dl, MVT::i32));
11540 }
11541 
11542 bool ReverseEXT = false;
11543 unsigned Imm;
11544 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
11545 if (ReverseEXT)
11546 std::swap(V1, V2);
11547 Imm *= getExtFactor(V1);
11548 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
11549 DAG.getConstant(Imm, dl, MVT::i32));
11550 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
11551 Imm *= getExtFactor(V1);
11552 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
11553 DAG.getConstant(Imm, dl, MVT::i32));
11554 }
11555 
11556 unsigned WhichResult;
11557 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
11558 unsigned Opc = (WhichResult == 0) ?
AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 11559 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 11560 } 11561 if (isUZPMask(ShuffleMask, VT, WhichResult)) { 11562 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 11563 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 11564 } 11565 if (isTRNMask(ShuffleMask, VT, WhichResult)) { 11566 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 11567 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 11568 } 11569 11570 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 11571 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 11572 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 11573 } 11574 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 11575 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 11576 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 11577 } 11578 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 11579 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 11580 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 11581 } 11582 11583 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 11584 return Concat; 11585 11586 bool DstIsLeft; 11587 int Anomaly; 11588 int NumInputElements = V1.getValueType().getVectorNumElements(); 11589 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 11590 SDValue DstVec = DstIsLeft ? V1 : V2; 11591 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 11592 11593 SDValue SrcVec = V1; 11594 int SrcLane = ShuffleMask[Anomaly]; 11595 if (SrcLane >= NumInputElements) { 11596 SrcVec = V2; 11597 SrcLane -= VT.getVectorNumElements(); 11598 } 11599 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 11600 11601 EVT ScalarVT = VT.getVectorElementType(); 11602 11603 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) 11604 ScalarVT = MVT::i32; 11605 11606 return DAG.getNode( 11607 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 11608 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 11609 DstLaneV); 11610 } 11611 11612 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG)) 11613 return NewSD; 11614 11615 // If the shuffle is not directly supported and it has 4 elements, use 11616 // the PerfectShuffle-generated table to synthesize it from other shuffles. 11617 unsigned NumElts = VT.getVectorNumElements(); 11618 if (NumElts == 4) { 11619 unsigned PFIndexes[4]; 11620 for (unsigned i = 0; i != 4; ++i) { 11621 if (ShuffleMask[i] < 0) 11622 PFIndexes[i] = 8; 11623 else 11624 PFIndexes[i] = ShuffleMask[i]; 11625 } 11626 11627 // Compute the index in the perfect shuffle table. 11628 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 11629 PFIndexes[2] * 9 + PFIndexes[3]; 11630 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 11631 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG, 11632 dl); 11633 } 11634 11635 return GenerateTBL(Op, ShuffleMask, DAG); 11636 } 11637 11638 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, 11639 SelectionDAG &DAG) const { 11640 EVT VT = Op.getValueType(); 11641 11642 if (useSVEForFixedLengthVectorVT(VT, 11643 Subtarget->forceStreamingCompatibleSVE())) 11644 return LowerToScalableOp(Op, DAG); 11645 11646 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && 11647 "Unexpected vector type!"); 11648 11649 // We can handle the constant cases during isel. 
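// For a non-constant i1 the whilelo trick below works as follows (sketch):
// sign-extending the i1 to i64 gives 0 or -1 (all ones); whilelo(0, 0) then
// produces an all-false predicate, while whilelo(0, UINT64_MAX) produces an
// all-true one, matching a splat of the original i1.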
11650 if (isa<ConstantSDNode>(Op.getOperand(0))) 11651 return Op; 11652 11653 // There isn't a natural way to handle the general i1 case, so we use some 11654 // trickery with whilelo. 11655 SDLoc DL(Op); 11656 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64); 11657 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal, 11658 DAG.getValueType(MVT::i1)); 11659 SDValue ID = 11660 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); 11661 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 11662 if (VT == MVT::nxv1i1) 11663 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1, 11664 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID, 11665 Zero, SplatVal), 11666 Zero); 11667 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal); 11668 } 11669 11670 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, 11671 SelectionDAG &DAG) const { 11672 SDLoc DL(Op); 11673 11674 EVT VT = Op.getValueType(); 11675 if (!isTypeLegal(VT) || !VT.isScalableVector()) 11676 return SDValue(); 11677 11678 // Current lowering only supports the SVE-ACLE types. 11679 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) 11680 return SDValue(); 11681 11682 // The DUPQ operation is indepedent of element type so normalise to i64s. 11683 SDValue Idx128 = Op.getOperand(2); 11684 11685 // DUPQ can be used when idx is in range. 11686 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); 11687 if (CIdx && (CIdx->getZExtValue() <= 3)) { 11688 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); 11689 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI); 11690 } 11691 11692 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); 11693 11694 // The ACLE says this must produce the same result as: 11695 // svtbl(data, svadd_x(svptrue_b64(), 11696 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), 11697 // index * 2)) 11698 SDValue One = DAG.getConstant(1, DL, MVT::i64); 11699 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); 11700 11701 // create the vector 0,1,0,1,... 11702 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64); 11703 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); 11704 11705 // create the vector idx64,idx64+1,idx64,idx64+1,... 11706 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); 11707 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); 11708 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); 11709 11710 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... 
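// For example, with index 1 the mask is <2,3,2,3,...>, so every 128-bit
// chunk of the result is a copy of the second quadword of the source.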
11711 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); 11712 return DAG.getNode(ISD::BITCAST, DL, VT, TBL); 11713 } 11714 11715 11716 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 11717 APInt &UndefBits) { 11718 EVT VT = BVN->getValueType(0); 11719 APInt SplatBits, SplatUndef; 11720 unsigned SplatBitSize; 11721 bool HasAnyUndefs; 11722 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 11723 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 11724 11725 for (unsigned i = 0; i < NumSplats; ++i) { 11726 CnstBits <<= SplatBitSize; 11727 UndefBits <<= SplatBitSize; 11728 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 11729 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 11730 } 11731 11732 return true; 11733 } 11734 11735 return false; 11736 } 11737 11738 // Try 64-bit splatted SIMD immediate. 11739 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 11740 const APInt &Bits) { 11741 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11742 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11743 EVT VT = Op.getValueType(); 11744 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; 11745 11746 if (AArch64_AM::isAdvSIMDModImmType10(Value)) { 11747 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); 11748 11749 SDLoc dl(Op); 11750 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 11751 DAG.getConstant(Value, dl, MVT::i32)); 11752 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11753 } 11754 } 11755 11756 return SDValue(); 11757 } 11758 11759 // Try 32-bit splatted SIMD immediate. 11760 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 11761 const APInt &Bits, 11762 const SDValue *LHS = nullptr) { 11763 EVT VT = Op.getValueType(); 11764 if (VT.isFixedLengthVector() && 11765 DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE()) 11766 return SDValue(); 11767 11768 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11769 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11770 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 11771 bool isAdvSIMDModImm = false; 11772 uint64_t Shift; 11773 11774 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { 11775 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); 11776 Shift = 0; 11777 } 11778 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { 11779 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); 11780 Shift = 8; 11781 } 11782 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { 11783 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); 11784 Shift = 16; 11785 } 11786 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { 11787 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); 11788 Shift = 24; 11789 } 11790 11791 if (isAdvSIMDModImm) { 11792 SDLoc dl(Op); 11793 SDValue Mov; 11794 11795 if (LHS) 11796 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 11797 DAG.getConstant(Value, dl, MVT::i32), 11798 DAG.getConstant(Shift, dl, MVT::i32)); 11799 else 11800 Mov = DAG.getNode(NewOp, dl, MovTy, 11801 DAG.getConstant(Value, dl, MVT::i32), 11802 DAG.getConstant(Shift, dl, MVT::i32)); 11803 11804 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11805 } 11806 } 11807 11808 return SDValue(); 11809 } 11810 11811 // Try 16-bit splatted SIMD immediate. 
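// For example, a value whose bit pattern repeats 0x00ab every 16 bits can be
// encoded as the 8-bit immediate 0xab with LSL #0, while a repeating 0xab00
// pattern uses the LSL #8 form.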
11812 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 11813 const APInt &Bits, 11814 const SDValue *LHS = nullptr) { 11815 EVT VT = Op.getValueType(); 11816 if (VT.isFixedLengthVector() && 11817 DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE()) 11818 return SDValue(); 11819 11820 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11821 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11822 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 11823 bool isAdvSIMDModImm = false; 11824 uint64_t Shift; 11825 11826 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { 11827 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); 11828 Shift = 0; 11829 } 11830 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { 11831 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); 11832 Shift = 8; 11833 } 11834 11835 if (isAdvSIMDModImm) { 11836 SDLoc dl(Op); 11837 SDValue Mov; 11838 11839 if (LHS) 11840 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 11841 DAG.getConstant(Value, dl, MVT::i32), 11842 DAG.getConstant(Shift, dl, MVT::i32)); 11843 else 11844 Mov = DAG.getNode(NewOp, dl, MovTy, 11845 DAG.getConstant(Value, dl, MVT::i32), 11846 DAG.getConstant(Shift, dl, MVT::i32)); 11847 11848 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11849 } 11850 } 11851 11852 return SDValue(); 11853 } 11854 11855 // Try 32-bit splatted SIMD immediate with shifted ones. 11856 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, 11857 SelectionDAG &DAG, const APInt &Bits) { 11858 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11859 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11860 EVT VT = Op.getValueType(); 11861 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 11862 bool isAdvSIMDModImm = false; 11863 uint64_t Shift; 11864 11865 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { 11866 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); 11867 Shift = 264; 11868 } 11869 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { 11870 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); 11871 Shift = 272; 11872 } 11873 11874 if (isAdvSIMDModImm) { 11875 SDLoc dl(Op); 11876 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 11877 DAG.getConstant(Value, dl, MVT::i32), 11878 DAG.getConstant(Shift, dl, MVT::i32)); 11879 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11880 } 11881 } 11882 11883 return SDValue(); 11884 } 11885 11886 // Try 8-bit splatted SIMD immediate. 11887 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 11888 const APInt &Bits) { 11889 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11890 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11891 EVT VT = Op.getValueType(); 11892 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 11893 11894 if (AArch64_AM::isAdvSIMDModImmType9(Value)) { 11895 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); 11896 11897 SDLoc dl(Op); 11898 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 11899 DAG.getConstant(Value, dl, MVT::i32)); 11900 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11901 } 11902 } 11903 11904 return SDValue(); 11905 } 11906 11907 // Try FP splatted SIMD immediate. 
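// FMOV (vector, immediate) can only encode values of the form +/-n/16 * 2^r
// with 16 <= n <= 31 and -3 <= r <= 4 (e.g. 1.0, -0.5, 31.0); anything else
// is left for the other expansion strategies.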
11908 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 11909 const APInt &Bits) { 11910 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 11911 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 11912 EVT VT = Op.getValueType(); 11913 bool isWide = (VT.getSizeInBits() == 128); 11914 MVT MovTy; 11915 bool isAdvSIMDModImm = false; 11916 11917 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { 11918 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); 11919 MovTy = isWide ? MVT::v4f32 : MVT::v2f32; 11920 } 11921 else if (isWide && 11922 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { 11923 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); 11924 MovTy = MVT::v2f64; 11925 } 11926 11927 if (isAdvSIMDModImm) { 11928 SDLoc dl(Op); 11929 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 11930 DAG.getConstant(Value, dl, MVT::i32)); 11931 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 11932 } 11933 } 11934 11935 return SDValue(); 11936 } 11937 11938 // Specialized code to quickly find if PotentialBVec is a BuildVector that 11939 // consists of only the same constant int value, returned in reference arg 11940 // ConstVal 11941 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 11942 uint64_t &ConstVal) { 11943 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 11944 if (!Bvec) 11945 return false; 11946 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 11947 if (!FirstElt) 11948 return false; 11949 EVT VT = Bvec->getValueType(0); 11950 unsigned NumElts = VT.getVectorNumElements(); 11951 for (unsigned i = 1; i < NumElts; ++i) 11952 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 11953 return false; 11954 ConstVal = FirstElt->getZExtValue(); 11955 return true; 11956 } 11957 11958 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 11959 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 11960 // BUILD_VECTORs with constant element C1, C2 is a constant, and: 11961 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) 11962 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) 11963 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. 11964 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 11965 EVT VT = N->getValueType(0); 11966 11967 if (!VT.isVector()) 11968 return SDValue(); 11969 11970 SDLoc DL(N); 11971 11972 SDValue And; 11973 SDValue Shift; 11974 11975 SDValue FirstOp = N->getOperand(0); 11976 unsigned FirstOpc = FirstOp.getOpcode(); 11977 SDValue SecondOp = N->getOperand(1); 11978 unsigned SecondOpc = SecondOp.getOpcode(); 11979 11980 // Is one of the operands an AND or a BICi? The AND may have been optimised to 11981 // a BICi in order to use an immediate instead of a register. 11982 // Is the other operand an shl or lshr? This will have been turned into: 11983 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. 
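// For example, with 32-bit elements,
//   (or (and X, <0x00ffffff,...>), (AArch64ISD::VSHL Y, 24))
// becomes (AArch64ISD::VSLI X, Y, 24): the AND mask clears exactly the bits
// that the shifted-in Y will occupy.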
11984 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && 11985 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { 11986 And = FirstOp; 11987 Shift = SecondOp; 11988 11989 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && 11990 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { 11991 And = SecondOp; 11992 Shift = FirstOp; 11993 } else 11994 return SDValue(); 11995 11996 bool IsAnd = And.getOpcode() == ISD::AND; 11997 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; 11998 11999 // Is the shift amount constant? 12000 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 12001 if (!C2node) 12002 return SDValue(); 12003 12004 uint64_t C1; 12005 if (IsAnd) { 12006 // Is the and mask vector all constant? 12007 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 12008 return SDValue(); 12009 } else { 12010 // Reconstruct the corresponding AND immediate from the two BICi immediates. 12011 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); 12012 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); 12013 assert(C1nodeImm && C1nodeShift); 12014 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); 12015 } 12016 12017 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or 12018 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account 12019 // how much one can shift elements of a particular size? 12020 uint64_t C2 = C2node->getZExtValue(); 12021 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 12022 if (C2 > ElemSizeInBits) 12023 return SDValue(); 12024 12025 APInt C1AsAPInt(ElemSizeInBits, C1); 12026 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) 12027 : APInt::getLowBitsSet(ElemSizeInBits, C2); 12028 if (C1AsAPInt != RequiredC1) 12029 return SDValue(); 12030 12031 SDValue X = And.getOperand(0); 12032 SDValue Y = Shift.getOperand(0); 12033 12034 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; 12035 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); 12036 12037 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 12038 LLVM_DEBUG(N->dump(&DAG)); 12039 LLVM_DEBUG(dbgs() << "into: \n"); 12040 LLVM_DEBUG(ResultSLI->dump(&DAG)); 12041 12042 ++NumShiftInserts; 12043 return ResultSLI; 12044 } 12045 12046 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 12047 SelectionDAG &DAG) const { 12048 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 12049 Subtarget->forceStreamingCompatibleSVE())) 12050 return LowerToScalableOp(Op, DAG); 12051 12052 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 12053 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 12054 return Res; 12055 12056 EVT VT = Op.getValueType(); 12057 12058 SDValue LHS = Op.getOperand(0); 12059 BuildVectorSDNode *BVN = 12060 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 12061 if (!BVN) { 12062 // OR commutes, so try swapping the operands. 
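// (Whichever side ends up as the BUILD_VECTOR supplies the candidate
// immediate; e.g. (or X, splat(0x00ab0000)) can become
// ORR Vd.4S, #0xab, LSL #16.)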
12063 LHS = Op.getOperand(1); 12064 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 12065 } 12066 if (!BVN) 12067 return Op; 12068 12069 APInt DefBits(VT.getSizeInBits(), 0); 12070 APInt UndefBits(VT.getSizeInBits(), 0); 12071 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 12072 SDValue NewOp; 12073 12074 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 12075 DefBits, &LHS)) || 12076 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 12077 DefBits, &LHS))) 12078 return NewOp; 12079 12080 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 12081 UndefBits, &LHS)) || 12082 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 12083 UndefBits, &LHS))) 12084 return NewOp; 12085 } 12086 12087 // We can always fall back to a non-immediate OR. 12088 return Op; 12089 } 12090 12091 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 12092 // be truncated to fit element width. 12093 static SDValue NormalizeBuildVector(SDValue Op, 12094 SelectionDAG &DAG) { 12095 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 12096 SDLoc dl(Op); 12097 EVT VT = Op.getValueType(); 12098 EVT EltTy= VT.getVectorElementType(); 12099 12100 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 12101 return Op; 12102 12103 SmallVector<SDValue, 16> Ops; 12104 for (SDValue Lane : Op->ops()) { 12105 // For integer vectors, type legalization would have promoted the 12106 // operands already. Otherwise, if Op is a floating-point splat 12107 // (with operands cast to integers), then the only possibilities 12108 // are constants and UNDEFs. 12109 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 12110 APInt LowBits(EltTy.getSizeInBits(), 12111 CstLane->getZExtValue()); 12112 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 12113 } else if (Lane.getNode()->isUndef()) { 12114 Lane = DAG.getUNDEF(MVT::i32); 12115 } else { 12116 assert(Lane.getValueType() == MVT::i32 && 12117 "Unexpected BUILD_VECTOR operand type"); 12118 } 12119 Ops.push_back(Lane); 12120 } 12121 return DAG.getBuildVector(VT, dl, Ops); 12122 } 12123 12124 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { 12125 EVT VT = Op.getValueType(); 12126 12127 APInt DefBits(VT.getSizeInBits(), 0); 12128 APInt UndefBits(VT.getSizeInBits(), 0); 12129 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 12130 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 12131 SDValue NewOp; 12132 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 12133 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 12134 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 12135 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 12136 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 12137 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 12138 return NewOp; 12139 12140 DefBits = ~DefBits; 12141 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 12142 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 12143 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 12144 return NewOp; 12145 12146 DefBits = UndefBits; 12147 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 12148 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 12149 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 12150 (NewOp = 
tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 12151 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 12152 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 12153 return NewOp; 12154 12155 DefBits = ~UndefBits; 12156 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 12157 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 12158 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 12159 return NewOp; 12160 } 12161 12162 return SDValue(); 12163 } 12164 12165 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 12166 SelectionDAG &DAG) const { 12167 EVT VT = Op.getValueType(); 12168 12169 if (useSVEForFixedLengthVectorVT(VT, 12170 Subtarget->forceStreamingCompatibleSVE())) { 12171 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) { 12172 SDLoc DL(Op); 12173 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 12174 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); 12175 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); 12176 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); 12177 return convertFromScalableVector(DAG, Op.getValueType(), Seq); 12178 } 12179 12180 // Revert to common legalisation for all other variants. 12181 return SDValue(); 12182 } 12183 12184 // Try to build a simple constant vector. 12185 Op = NormalizeBuildVector(Op, DAG); 12186 // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so, 12187 // abort. 12188 if (Op.getOpcode() != ISD::BUILD_VECTOR) 12189 return SDValue(); 12190 12191 if (VT.isInteger()) { 12192 // Certain vector constants, used to express things like logical NOT and 12193 // arithmetic NEG, are passed through unmodified. This allows special 12194 // patterns for these operations to match, which will lower these constants 12195 // to whatever is proven necessary. 12196 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 12197 if (BVN->isConstant()) 12198 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { 12199 unsigned BitSize = VT.getVectorElementType().getSizeInBits(); 12200 APInt Val(BitSize, 12201 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); 12202 if (Val.isZero() || Val.isAllOnes()) 12203 return Op; 12204 } 12205 } 12206 12207 if (SDValue V = ConstantBuildVector(Op, DAG)) 12208 return V; 12209 12210 // Scan through the operands to find some interesting properties we can 12211 // exploit: 12212 // 1) If only one value is used, we can use a DUP, or 12213 // 2) if only the low element is not undef, we can just insert that, or 12214 // 3) if only one constant value is used (w/ some non-constant lanes), 12215 // we can splat the constant value into the whole vector then fill 12216 // in the non-constant lanes. 12217 // 4) FIXME: If different constant values are used, but we can intelligently 12218 // select the values we'll be overwriting for the non-constant 12219 // lanes such that we can directly materialize the vector 12220 // some other way (MOVI, e.g.), we can be sneaky. 12221 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. 
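// For instance, <a, a, a, a> hits case 1 and becomes a single DUP, while
// <x, 7, 7, 7> hits case 3: splat the 7 first, then insert x into lane 0,
// rather than initializing every lane individually.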
12222 SDLoc dl(Op); 12223 unsigned NumElts = VT.getVectorNumElements(); 12224 bool isOnlyLowElement = true; 12225 bool usesOnlyOneValue = true; 12226 bool usesOnlyOneConstantValue = true; 12227 bool isConstant = true; 12228 bool AllLanesExtractElt = true; 12229 unsigned NumConstantLanes = 0; 12230 unsigned NumDifferentLanes = 0; 12231 unsigned NumUndefLanes = 0; 12232 SDValue Value; 12233 SDValue ConstantValue; 12234 for (unsigned i = 0; i < NumElts; ++i) { 12235 SDValue V = Op.getOperand(i); 12236 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12237 AllLanesExtractElt = false; 12238 if (V.isUndef()) { 12239 ++NumUndefLanes; 12240 continue; 12241 } 12242 if (i > 0) 12243 isOnlyLowElement = false; 12244 if (!isIntOrFPConstant(V)) 12245 isConstant = false; 12246 12247 if (isIntOrFPConstant(V)) { 12248 ++NumConstantLanes; 12249 if (!ConstantValue.getNode()) 12250 ConstantValue = V; 12251 else if (ConstantValue != V) 12252 usesOnlyOneConstantValue = false; 12253 } 12254 12255 if (!Value.getNode()) 12256 Value = V; 12257 else if (V != Value) { 12258 usesOnlyOneValue = false; 12259 ++NumDifferentLanes; 12260 } 12261 } 12262 12263 if (!Value.getNode()) { 12264 LLVM_DEBUG( 12265 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); 12266 return DAG.getUNDEF(VT); 12267 } 12268 12269 // Convert BUILD_VECTOR where all elements but the lowest are undef into 12270 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector 12271 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. 12272 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { 12273 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " 12274 "SCALAR_TO_VECTOR node\n"); 12275 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 12276 } 12277 12278 if (AllLanesExtractElt) { 12279 SDNode *Vector = nullptr; 12280 bool Even = false; 12281 bool Odd = false; 12282 // Check whether the extract elements match the Even pattern <0,2,4,...> or 12283 // the Odd pattern <1,3,5,...>. 12284 for (unsigned i = 0; i < NumElts; ++i) { 12285 SDValue V = Op.getOperand(i); 12286 const SDNode *N = V.getNode(); 12287 if (!isa<ConstantSDNode>(N->getOperand(1))) 12288 break; 12289 SDValue N0 = N->getOperand(0); 12290 12291 // All elements are extracted from the same vector. 12292 if (!Vector) { 12293 Vector = N0.getNode(); 12294 // Check that the type of EXTRACT_VECTOR_ELT matches the type of 12295 // BUILD_VECTOR. 12296 if (VT.getVectorElementType() != 12297 N0.getValueType().getVectorElementType()) 12298 break; 12299 } else if (Vector != N0.getNode()) { 12300 Odd = false; 12301 Even = false; 12302 break; 12303 } 12304 12305 // Extracted values are either at Even indices <0,2,4,...> or at Odd 12306 // indices <1,3,5,...>. 12307 uint64_t Val = N->getConstantOperandVal(1); 12308 if (Val == 2 * i) { 12309 Even = true; 12310 continue; 12311 } 12312 if (Val - 1 == 2 * i) { 12313 Odd = true; 12314 continue; 12315 } 12316 12317 // Something does not match: abort. 
12318 Odd = false; 12319 Even = false; 12320 break; 12321 } 12322 if (Even || Odd) { 12323 SDValue LHS = 12324 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 12325 DAG.getConstant(0, dl, MVT::i64)); 12326 SDValue RHS = 12327 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 12328 DAG.getConstant(NumElts, dl, MVT::i64)); 12329 12330 if (Even && !Odd) 12331 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, 12332 RHS); 12333 if (Odd && !Even) 12334 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, 12335 RHS); 12336 } 12337 } 12338 12339 // Use DUP for non-constant splats. For f32 constant splats, reduce to 12340 // i32 and try again. 12341 if (usesOnlyOneValue) { 12342 if (!isConstant) { 12343 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 12344 Value.getValueType() != VT) { 12345 LLVM_DEBUG( 12346 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); 12347 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 12348 } 12349 12350 // This is actually a DUPLANExx operation, which keeps everything vectory. 12351 12352 SDValue Lane = Value.getOperand(1); 12353 Value = Value.getOperand(0); 12354 if (Value.getValueSizeInBits() == 64) { 12355 LLVM_DEBUG( 12356 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " 12357 "widening it\n"); 12358 Value = WidenVector(Value, DAG); 12359 } 12360 12361 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 12362 return DAG.getNode(Opcode, dl, VT, Value, Lane); 12363 } 12364 12365 if (VT.getVectorElementType().isFloatingPoint()) { 12366 SmallVector<SDValue, 8> Ops; 12367 EVT EltTy = VT.getVectorElementType(); 12368 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || 12369 EltTy == MVT::f64) && "Unsupported floating-point vector type"); 12370 LLVM_DEBUG( 12371 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " 12372 "BITCASTS, and try again\n"); 12373 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 12374 for (unsigned i = 0; i < NumElts; ++i) 12375 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 12376 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 12377 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 12378 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; 12379 Val.dump();); 12380 Val = LowerBUILD_VECTOR(Val, DAG); 12381 if (Val.getNode()) 12382 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 12383 } 12384 } 12385 12386 // If we need to insert a small number of different non-constant elements and 12387 // the vector width is sufficiently large, prefer using DUP with the common 12388 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, 12389 // skip the constant lane handling below. 12390 bool PreferDUPAndInsert = 12391 !isConstant && NumDifferentLanes >= 1 && 12392 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && 12393 NumDifferentLanes >= NumConstantLanes; 12394 12395 // If there was only one constant value used and for more than one lane, 12396 // start by splatting that value, then replace the non-constant lanes. This 12397 // is better than the default, which will perform a separate initialization 12398 // for each lane. 12399 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { 12400 // Firstly, try to materialize the splat constant. 
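// e.g. for v4i32 <7, 7, x, 7> this yields a MOVI-style splat of 7 followed
// by a single INSERT_VECTOR_ELT of x into lane 2.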
12401 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), 12402 Val = ConstantBuildVector(Vec, DAG); 12403 if (!Val) { 12404 // Otherwise, materialize the constant and splat it. 12405 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 12406 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); 12407 } 12408 12409 // Now insert the non-constant lanes. 12410 for (unsigned i = 0; i < NumElts; ++i) { 12411 SDValue V = Op.getOperand(i); 12412 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 12413 if (!isIntOrFPConstant(V)) 12414 // Note that type legalization likely mucked about with the VT of the 12415 // source operand, so we may have to convert it here before inserting. 12416 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 12417 } 12418 return Val; 12419 } 12420 12421 // This will generate a load from the constant pool. 12422 if (isConstant) { 12423 LLVM_DEBUG( 12424 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " 12425 "expansion\n"); 12426 return SDValue(); 12427 } 12428 12429 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from 12430 // v4i32s. This is really a truncate, which we can construct out of (legal) 12431 // concats and truncate nodes. 12432 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG)) 12433 return M; 12434 12435 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 12436 if (NumElts >= 4) { 12437 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 12438 return shuffle; 12439 } 12440 12441 if (PreferDUPAndInsert) { 12442 // First, build a constant vector with the common element. 12443 SmallVector<SDValue, 8> Ops(NumElts, Value); 12444 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); 12445 // Next, insert the elements that do not match the common value. 12446 for (unsigned I = 0; I < NumElts; ++I) 12447 if (Op.getOperand(I) != Value) 12448 NewVector = 12449 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, 12450 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); 12451 12452 return NewVector; 12453 } 12454 12455 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 12456 // know the default expansion would otherwise fall back on something even 12457 // worse. For a vector with one or two non-undef values, that's 12458 // scalar_to_vector for the elements followed by a shuffle (provided the 12459 // shuffle is valid for the target) and materialization element by element 12460 // on the stack followed by a load for everything else. 12461 if (!isConstant && !usesOnlyOneValue) { 12462 LLVM_DEBUG( 12463 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " 12464 "of INSERT_VECTOR_ELT\n"); 12465 12466 SDValue Vec = DAG.getUNDEF(VT); 12467 SDValue Op0 = Op.getOperand(0); 12468 unsigned i = 0; 12469 12470 // Use SCALAR_TO_VECTOR for lane zero to 12471 // a) Avoid a RMW dependency on the full vector register, and 12472 // b) Allow the register coalescer to fold away the copy if the 12473 // value is already in an S or D register, and we're forced to emit an 12474 // INSERT_SUBREG that we can't fold anywhere. 12475 // 12476 // We also allow types like i8 and i16 which are illegal scalar but legal 12477 // vector element types. After type-legalization the inserted value is 12478 // extended (i32) and it is safe to cast them to the vector type by ignoring 12479 // the upper bits of the lowest lane (e.g. v8i8, v4i16). 
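// For example, <a, b, undef, c> is built as:
//   t0 = SCALAR_TO_VECTOR a
//   t1 = INSERT_VECTOR_ELT t0, b, 1
//   t2 = INSERT_VECTOR_ELT t1, c, 3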
12480 if (!Op0.isUndef()) { 12481 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); 12482 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); 12483 ++i; 12484 } 12485 LLVM_DEBUG(if (i < NumElts) dbgs() 12486 << "Creating nodes for the other vector elements:\n";); 12487 for (; i < NumElts; ++i) { 12488 SDValue V = Op.getOperand(i); 12489 if (V.isUndef()) 12490 continue; 12491 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 12492 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 12493 } 12494 return Vec; 12495 } 12496 12497 LLVM_DEBUG( 12498 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " 12499 "better alternative\n"); 12500 return SDValue(); 12501 } 12502 12503 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, 12504 SelectionDAG &DAG) const { 12505 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 12506 Subtarget->forceStreamingCompatibleSVE())) 12507 return LowerFixedLengthConcatVectorsToSVE(Op, DAG); 12508 12509 assert(Op.getValueType().isScalableVector() && 12510 isTypeLegal(Op.getValueType()) && 12511 "Expected legal scalable vector type!"); 12512 12513 if (isTypeLegal(Op.getOperand(0).getValueType())) { 12514 unsigned NumOperands = Op->getNumOperands(); 12515 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && 12516 "Unexpected number of operands in CONCAT_VECTORS"); 12517 12518 if (NumOperands == 2) 12519 return Op; 12520 12521 // Concat each pair of subvectors and pack into the lower half of the array. 12522 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); 12523 while (ConcatOps.size() > 1) { 12524 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { 12525 SDValue V1 = ConcatOps[I]; 12526 SDValue V2 = ConcatOps[I + 1]; 12527 EVT SubVT = V1.getValueType(); 12528 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext()); 12529 ConcatOps[I / 2] = 12530 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2); 12531 } 12532 ConcatOps.resize(ConcatOps.size() / 2); 12533 } 12534 return ConcatOps[0]; 12535 } 12536 12537 return SDValue(); 12538 } 12539 12540 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 12541 SelectionDAG &DAG) const { 12542 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 12543 12544 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 12545 Subtarget->forceStreamingCompatibleSVE())) 12546 return LowerFixedLengthInsertVectorElt(Op, DAG); 12547 12548 // Check for non-constant or out of range lane. 12549 EVT VT = Op.getOperand(0).getValueType(); 12550 12551 if (VT.getScalarType() == MVT::i1) { 12552 EVT VectorVT = getPromotedVTForPredicate(VT); 12553 SDLoc DL(Op); 12554 SDValue ExtendedVector = 12555 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT); 12556 SDValue ExtendedValue = 12557 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL, 12558 VectorVT.getScalarType().getSizeInBits() < 32 12559 ? MVT::i32 12560 : VectorVT.getScalarType()); 12561 ExtendedVector = 12562 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector, 12563 ExtendedValue, Op.getOperand(2)); 12564 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT); 12565 } 12566 12567 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 12568 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 12569 return SDValue(); 12570 12571 // Insertion/extraction are legal for V128 types. 
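// e.g. (insert_vector_elt v4i32:V, i32:X, 1) can be selected directly to
// INS Vd.S[1], Wn.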
12572 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 12573 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 12574 VT == MVT::v8f16 || VT == MVT::v8bf16) 12575 return Op; 12576 12577 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 12578 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 12579 VT != MVT::v4bf16) 12580 return SDValue(); 12581 12582 // For V64 types, we perform insertion by expanding the value 12583 // to a V128 type and perform the insertion on that. 12584 SDLoc DL(Op); 12585 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 12586 EVT WideTy = WideVec.getValueType(); 12587 12588 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 12589 Op.getOperand(1), Op.getOperand(2)); 12590 // Re-narrow the resultant vector. 12591 return NarrowVector(Node, DAG); 12592 } 12593 12594 SDValue 12595 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 12596 SelectionDAG &DAG) const { 12597 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 12598 EVT VT = Op.getOperand(0).getValueType(); 12599 12600 if (VT.getScalarType() == MVT::i1) { 12601 // We can't directly extract from an SVE predicate; extend it first. 12602 // (This isn't the only possible lowering, but it's straightforward.) 12603 EVT VectorVT = getPromotedVTForPredicate(VT); 12604 SDLoc DL(Op); 12605 SDValue Extend = 12606 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0)); 12607 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32; 12608 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, 12609 Extend, Op.getOperand(1)); 12610 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType()); 12611 } 12612 12613 if (useSVEForFixedLengthVectorVT(VT, 12614 Subtarget->forceStreamingCompatibleSVE())) 12615 return LowerFixedLengthExtractVectorElt(Op, DAG); 12616 12617 // Check for non-constant or out of range lane. 12618 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 12619 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 12620 return SDValue(); 12621 12622 // Insertion/extraction are legal for V128 types. 12623 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 12624 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 12625 VT == MVT::v8f16 || VT == MVT::v8bf16) 12626 return Op; 12627 12628 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 12629 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 12630 VT != MVT::v4bf16) 12631 return SDValue(); 12632 12633 // For V64 types, we perform extraction by expanding the value 12634 // to a V128 type and perform the extraction on that. 12635 SDLoc DL(Op); 12636 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 12637 EVT WideTy = WideVec.getValueType(); 12638 12639 EVT ExtrTy = WideTy.getVectorElementType(); 12640 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 12641 ExtrTy = MVT::i32; 12642 12643 // For extractions, we just return the result directly. 
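// (Unlike the insertion case there is no vector result to re-narrow; the
// scalar extracted from the widened vector is already the value we want.)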
12644 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 12645 Op.getOperand(1)); 12646 } 12647 12648 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 12649 SelectionDAG &DAG) const { 12650 assert(Op.getValueType().isFixedLengthVector() && 12651 "Only cases that extract a fixed length vector are supported!"); 12652 12653 EVT InVT = Op.getOperand(0).getValueType(); 12654 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 12655 unsigned Size = Op.getValueSizeInBits(); 12656 12657 // If we don't have legal types yet, do nothing 12658 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) 12659 return SDValue(); 12660 12661 if (InVT.isScalableVector()) { 12662 // This will be matched by custom code during ISelDAGToDAG. 12663 if (Idx == 0 && isPackedVectorType(InVT, DAG)) 12664 return Op; 12665 12666 return SDValue(); 12667 } 12668 12669 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 12670 if (Idx == 0 && InVT.getSizeInBits() <= 128) 12671 return Op; 12672 12673 // If this is extracting the upper 64-bits of a 128-bit vector, we match 12674 // that directly. 12675 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && 12676 InVT.getSizeInBits() == 128 && !Subtarget->forceStreamingCompatibleSVE()) 12677 return Op; 12678 12679 if (useSVEForFixedLengthVectorVT(InVT, 12680 Subtarget->forceStreamingCompatibleSVE())) { 12681 SDLoc DL(Op); 12682 12683 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 12684 SDValue NewInVec = 12685 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); 12686 12687 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec, 12688 NewInVec, DAG.getConstant(Idx, DL, MVT::i64)); 12689 return convertFromScalableVector(DAG, Op.getValueType(), Splice); 12690 } 12691 12692 return SDValue(); 12693 } 12694 12695 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, 12696 SelectionDAG &DAG) const { 12697 assert(Op.getValueType().isScalableVector() && 12698 "Only expect to lower inserts into scalable vectors!"); 12699 12700 EVT InVT = Op.getOperand(1).getValueType(); 12701 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 12702 12703 SDValue Vec0 = Op.getOperand(0); 12704 SDValue Vec1 = Op.getOperand(1); 12705 SDLoc DL(Op); 12706 EVT VT = Op.getValueType(); 12707 12708 if (InVT.isScalableVector()) { 12709 if (!isTypeLegal(VT)) 12710 return SDValue(); 12711 12712 // Break down insert_subvector into simpler parts. 12713 if (VT.getVectorElementType() == MVT::i1) { 12714 unsigned NumElts = VT.getVectorMinNumElements(); 12715 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 12716 12717 SDValue Lo, Hi; 12718 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, 12719 DAG.getVectorIdxConstant(0, DL)); 12720 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, 12721 DAG.getVectorIdxConstant(NumElts / 2, DL)); 12722 if (Idx < (NumElts / 2)) { 12723 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1, 12724 DAG.getVectorIdxConstant(Idx, DL)); 12725 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi); 12726 } else { 12727 SDValue NewHi = 12728 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1, 12729 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL)); 12730 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi); 12731 } 12732 } 12733 12734 // Ensure the subvector is half the size of the main vector. 
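// e.g. an nxv2i32 subvector inserted into the low (Idx == 0) or high half of
// an nxv4i32; other element-count ratios are rejected below.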
12735 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) 12736 return SDValue(); 12737 12738 // Here narrow and wide refers to the vector element types. After "casting" 12739 // both vectors must have the same bit length and so because the subvector 12740 // has fewer elements, those elements need to be bigger. 12741 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount()); 12742 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount()); 12743 12744 // NOP cast operands to the largest legal vector of the same element count. 12745 if (VT.isFloatingPoint()) { 12746 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); 12747 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); 12748 } else { 12749 // Legal integer vectors are already their largest so Vec0 is fine as is. 12750 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); 12751 } 12752 12753 // To replace the top/bottom half of vector V with vector SubV we widen the 12754 // preserved half of V, concatenate this to SubV (the order depending on the 12755 // half being replaced) and then narrow the result. 12756 SDValue Narrow; 12757 if (Idx == 0) { 12758 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); 12759 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); 12760 } else { 12761 assert(Idx == InVT.getVectorMinNumElements() && 12762 "Invalid subvector index!"); 12763 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); 12764 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); 12765 } 12766 12767 return getSVESafeBitCast(VT, Narrow, DAG); 12768 } 12769 12770 if (Idx == 0 && isPackedVectorType(VT, DAG)) { 12771 // This will be matched by custom code during ISelDAGToDAG. 12772 if (Vec0.isUndef()) 12773 return Op; 12774 12775 std::optional<unsigned> PredPattern = 12776 getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); 12777 auto PredTy = VT.changeVectorElementType(MVT::i1); 12778 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern); 12779 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); 12780 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); 12781 } 12782 12783 return SDValue(); 12784 } 12785 12786 static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { 12787 if (Op.getOpcode() != AArch64ISD::DUP && 12788 Op.getOpcode() != ISD::SPLAT_VECTOR && 12789 Op.getOpcode() != ISD::BUILD_VECTOR) 12790 return false; 12791 12792 if (Op.getOpcode() == ISD::BUILD_VECTOR && 12793 !isAllConstantBuildVector(Op, SplatVal)) 12794 return false; 12795 12796 if (Op.getOpcode() != ISD::BUILD_VECTOR && 12797 !isa<ConstantSDNode>(Op->getOperand(0))) 12798 return false; 12799 12800 SplatVal = Op->getConstantOperandVal(0); 12801 if (Op.getValueType().getVectorElementType() != MVT::i64) 12802 SplatVal = (int32_t)SplatVal; 12803 12804 Negated = false; 12805 if (isPowerOf2_64(SplatVal)) 12806 return true; 12807 12808 Negated = true; 12809 if (isPowerOf2_64(-SplatVal)) { 12810 SplatVal = -SplatVal; 12811 return true; 12812 } 12813 12814 return false; 12815 } 12816 12817 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { 12818 EVT VT = Op.getValueType(); 12819 SDLoc dl(Op); 12820 12821 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) 12822 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); 12823 12824 assert(VT.isScalableVector() && "Expected a scalable vector."); 12825 12826 bool Signed = Op.getOpcode() == ISD::SDIV; 12827 unsigned PredOpcode = Signed ? 
AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; 12828 12829 bool Negated; 12830 uint64_t SplatVal; 12831 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { 12832 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT); 12833 SDValue Res = 12834 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0), 12835 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32)); 12836 if (Negated) 12837 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); 12838 12839 return Res; 12840 } 12841 12842 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) 12843 return LowerToPredicatedOp(Op, DAG, PredOpcode); 12844 12845 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit 12846 // operations, and truncate the result. 12847 EVT WidenedVT; 12848 if (VT == MVT::nxv16i8) 12849 WidenedVT = MVT::nxv8i16; 12850 else if (VT == MVT::nxv8i16) 12851 WidenedVT = MVT::nxv4i32; 12852 else 12853 llvm_unreachable("Unexpected Custom DIV operation"); 12854 12855 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; 12856 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; 12857 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); 12858 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); 12859 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); 12860 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); 12861 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); 12862 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); 12863 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); 12864 } 12865 12866 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 12867 // Currently no fixed length shuffles that require SVE are legal. 12868 if (useSVEForFixedLengthVectorVT(VT, 12869 Subtarget->forceStreamingCompatibleSVE())) 12870 return false; 12871 12872 if (VT.getVectorNumElements() == 4 && 12873 (VT.is128BitVector() || VT.is64BitVector())) { 12874 unsigned Cost = getPerfectShuffleCost(M); 12875 if (Cost <= 1) 12876 return true; 12877 } 12878 12879 bool DummyBool; 12880 int DummyInt; 12881 unsigned DummyUnsigned; 12882 12883 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 12884 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 12885 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 12886 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 12887 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 12888 isZIPMask(M, VT, DummyUnsigned) || 12889 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 12890 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 12891 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 12892 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 12893 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 12894 } 12895 12896 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M, 12897 EVT VT) const { 12898 // Just delegate to the generic legality, clear masks aren't special. 12899 return isShuffleMaskLegal(M, VT); 12900 } 12901 12902 /// getVShiftImm - Check if this is a valid build_vector for the immediate 12903 /// operand of a vector shift operation, where all the elements of the 12904 /// build_vector must have the same constant integer value. 12905 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 12906 // Ignore bit_converts. 
12907 while (Op.getOpcode() == ISD::BITCAST) 12908 Op = Op.getOperand(0); 12909 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 12910 APInt SplatBits, SplatUndef; 12911 unsigned SplatBitSize; 12912 bool HasAnyUndefs; 12913 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, 12914 HasAnyUndefs, ElementBits) || 12915 SplatBitSize > ElementBits) 12916 return false; 12917 Cnt = SplatBits.getSExtValue(); 12918 return true; 12919 } 12920 12921 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 12922 /// operand of a vector shift left operation. That value must be in the range: 12923 /// 0 <= Value < ElementBits for a left shift; or 12924 /// 0 <= Value <= ElementBits for a long left shift. 12925 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 12926 assert(VT.isVector() && "vector shift count is not a vector type"); 12927 int64_t ElementBits = VT.getScalarSizeInBits(); 12928 if (!getVShiftImm(Op, ElementBits, Cnt)) 12929 return false; 12930 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 12931 } 12932 12933 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 12934 /// operand of a vector shift right operation. The value must be in the range: 12935 /// 1 <= Value <= ElementBits for a right shift; or /// 1 <= Value <= ElementBits/2 for a narrow right shift. 12936 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { 12937 assert(VT.isVector() && "vector shift count is not a vector type"); 12938 int64_t ElementBits = VT.getScalarSizeInBits(); 12939 if (!getVShiftImm(Op, ElementBits, Cnt)) 12940 return false; 12941 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 12942 } 12943 12944 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, 12945 SelectionDAG &DAG) const { 12946 EVT VT = Op.getValueType(); 12947 12948 if (VT.getScalarType() == MVT::i1) { 12949 // Lower i1 truncate to `(x & 1) != 0`.
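// e.g. (truncate nxv4i32 %x to nxv4i1) is emitted as
//   setcc (and %x, splat(1)), splat(0), setne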
12950 SDLoc dl(Op); 12951 EVT OpVT = Op.getOperand(0).getValueType(); 12952 SDValue Zero = DAG.getConstant(0, dl, OpVT); 12953 SDValue One = DAG.getConstant(1, dl, OpVT); 12954 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); 12955 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); 12956 } 12957 12958 if (!VT.isVector() || VT.isScalableVector()) 12959 return SDValue(); 12960 12961 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), 12962 Subtarget->forceStreamingCompatibleSVE())) 12963 return LowerFixedLengthVectorTruncateToSVE(Op, DAG); 12964 12965 return SDValue(); 12966 } 12967 12968 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, 12969 SelectionDAG &DAG) const { 12970 EVT VT = Op.getValueType(); 12971 SDLoc DL(Op); 12972 int64_t Cnt; 12973 12974 if (!Op.getOperand(1).getValueType().isVector()) 12975 return Op; 12976 unsigned EltSize = VT.getScalarSizeInBits(); 12977 12978 switch (Op.getOpcode()) { 12979 case ISD::SHL: 12980 if (VT.isScalableVector() || 12981 useSVEForFixedLengthVectorVT(VT, 12982 Subtarget->forceStreamingCompatibleSVE())) 12983 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); 12984 12985 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) 12986 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), 12987 DAG.getConstant(Cnt, DL, MVT::i32)); 12988 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12989 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, 12990 MVT::i32), 12991 Op.getOperand(0), Op.getOperand(1)); 12992 case ISD::SRA: 12993 case ISD::SRL: 12994 if (VT.isScalableVector() || 12995 useSVEForFixedLengthVectorVT( 12996 VT, Subtarget->forceStreamingCompatibleSVE())) { 12997 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED 12998 : AArch64ISD::SRL_PRED; 12999 return LowerToPredicatedOp(Op, DAG, Opc); 13000 } 13001 13002 // Right shift immediate 13003 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { 13004 unsigned Opc = 13005 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; 13006 return DAG.getNode(Opc, DL, VT, Op.getOperand(0), 13007 DAG.getConstant(Cnt, DL, MVT::i32)); 13008 } 13009 13010 // Right shift register. Note, there is not a shift right register 13011 // instruction, but the shift left register instruction takes a signed 13012 // value, where negative numbers specify a right shift. 13013 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::aarch64_neon_sshl 13014 : Intrinsic::aarch64_neon_ushl; 13015 // negate the shift amount 13016 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 13017 Op.getOperand(1)); 13018 SDValue NegShiftLeft = 13019 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13020 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), 13021 NegShift); 13022 return NegShiftLeft; 13023 } 13024 13025 llvm_unreachable("unexpected shift opcode"); 13026 } 13027 13028 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, 13029 AArch64CC::CondCode CC, bool NoNans, EVT VT, 13030 const SDLoc &dl, SelectionDAG &DAG) { 13031 EVT SrcVT = LHS.getValueType(); 13032 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && 13033 "function only supposed to emit natural comparisons"); 13034 13035 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); 13036 APInt CnstBits(VT.getSizeInBits(), 0); 13037 APInt UndefBits(VT.getSizeInBits(), 0); 13038 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); 13039 bool IsZero = IsCnst && (CnstBits == 0); 13040 13041 if (SrcVT.getVectorElementType().isFloatingPoint()) { 13042 switch (CC) { 13043 default: 13044 return SDValue(); 13045 case AArch64CC::NE: { 13046 SDValue Fcmeq; 13047 if (IsZero) 13048 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 13049 else 13050 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 13051 return DAG.getNOT(dl, Fcmeq, VT); 13052 } 13053 case AArch64CC::EQ: 13054 if (IsZero) 13055 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 13056 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 13057 case AArch64CC::GE: 13058 if (IsZero) 13059 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); 13060 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); 13061 case AArch64CC::GT: 13062 if (IsZero) 13063 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); 13064 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); 13065 case AArch64CC::LE: 13066 if (!NoNans) 13067 return SDValue(); 13068 // If we ignore NaNs then we can use to the LS implementation. 13069 [[fallthrough]]; 13070 case AArch64CC::LS: 13071 if (IsZero) 13072 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); 13073 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); 13074 case AArch64CC::LT: 13075 if (!NoNans) 13076 return SDValue(); 13077 // If we ignore NaNs then we can use to the MI implementation. 
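// (With NaNs excluded the unordered case cannot occur, so the ordered
// FCMGT/FCMGE comparisons used for MI/LS below give the same answer as
// LT/LE would.)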
13078 [[fallthrough]]; 13079 case AArch64CC::MI: 13080 if (IsZero) 13081 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); 13082 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); 13083 } 13084 } 13085 13086 switch (CC) { 13087 default: 13088 return SDValue(); 13089 case AArch64CC::NE: { 13090 SDValue Cmeq; 13091 if (IsZero) 13092 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 13093 else 13094 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 13095 return DAG.getNOT(dl, Cmeq, VT); 13096 } 13097 case AArch64CC::EQ: 13098 if (IsZero) 13099 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 13100 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 13101 case AArch64CC::GE: 13102 if (IsZero) 13103 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); 13104 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); 13105 case AArch64CC::GT: 13106 if (IsZero) 13107 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); 13108 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); 13109 case AArch64CC::LE: 13110 if (IsZero) 13111 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); 13112 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); 13113 case AArch64CC::LS: 13114 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); 13115 case AArch64CC::LO: 13116 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); 13117 case AArch64CC::LT: 13118 if (IsZero) 13119 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); 13120 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); 13121 case AArch64CC::HI: 13122 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); 13123 case AArch64CC::HS: 13124 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); 13125 } 13126 } 13127 13128 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, 13129 SelectionDAG &DAG) const { 13130 if (Op.getValueType().isScalableVector()) 13131 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); 13132 13133 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), 13134 Subtarget->forceStreamingCompatibleSVE())) 13135 return LowerFixedLengthVectorSetccToSVE(Op, DAG); 13136 13137 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 13138 SDValue LHS = Op.getOperand(0); 13139 SDValue RHS = Op.getOperand(1); 13140 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); 13141 SDLoc dl(Op); 13142 13143 if (LHS.getValueType().getVectorElementType().isInteger()) { 13144 assert(LHS.getValueType() == RHS.getValueType()); 13145 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 13146 SDValue Cmp = 13147 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); 13148 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 13149 } 13150 13151 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); 13152 13153 // Make v4f16 (only) fcmp operations utilise vector instructions 13154 // v8f16 support will be a litle more complicated 13155 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { 13156 if (LHS.getValueType().getVectorNumElements() == 4) { 13157 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); 13158 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); 13159 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); 13160 DAG.ReplaceAllUsesWith(Op, NewSetcc); 13161 CmpVT = MVT::v4i32; 13162 } else 13163 return SDValue(); 13164 } 13165 13166 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || 13167 LHS.getValueType().getVectorElementType() != MVT::f128); 13168 13169 // 
Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 13170 // clean. Some of them require two branches to implement. 13171 AArch64CC::CondCode CC1, CC2; 13172 bool ShouldInvert; 13173 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 13174 13175 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); 13176 SDValue Cmp = 13177 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 13178 if (!Cmp.getNode()) 13179 return SDValue(); 13180 13181 if (CC2 != AArch64CC::AL) { 13182 SDValue Cmp2 = 13183 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 13184 if (!Cmp2.getNode()) 13185 return SDValue(); 13186 13187 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 13188 } 13189 13190 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 13191 13192 if (ShouldInvert) 13193 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 13194 13195 return Cmp; 13196 } 13197 13198 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, 13199 SelectionDAG &DAG) { 13200 SDValue VecOp = ScalarOp.getOperand(0); 13201 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); 13202 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, 13203 DAG.getConstant(0, DL, MVT::i64)); 13204 } 13205 13206 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, 13207 SelectionDAG &DAG) const { 13208 SDValue Src = Op.getOperand(0); 13209 13210 // Try to lower fixed length reductions to SVE. 13211 EVT SrcVT = Src.getValueType(); 13212 bool OverrideNEON = Subtarget->forceStreamingCompatibleSVE() || 13213 Op.getOpcode() == ISD::VECREDUCE_AND || 13214 Op.getOpcode() == ISD::VECREDUCE_OR || 13215 Op.getOpcode() == ISD::VECREDUCE_XOR || 13216 Op.getOpcode() == ISD::VECREDUCE_FADD || 13217 (Op.getOpcode() != ISD::VECREDUCE_ADD && 13218 SrcVT.getVectorElementType() == MVT::i64); 13219 if (SrcVT.isScalableVector() || 13220 useSVEForFixedLengthVectorVT( 13221 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) { 13222 13223 if (SrcVT.getVectorElementType() == MVT::i1) 13224 return LowerPredReductionToSVE(Op, DAG); 13225 13226 switch (Op.getOpcode()) { 13227 case ISD::VECREDUCE_ADD: 13228 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); 13229 case ISD::VECREDUCE_AND: 13230 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); 13231 case ISD::VECREDUCE_OR: 13232 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); 13233 case ISD::VECREDUCE_SMAX: 13234 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); 13235 case ISD::VECREDUCE_SMIN: 13236 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); 13237 case ISD::VECREDUCE_UMAX: 13238 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); 13239 case ISD::VECREDUCE_UMIN: 13240 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); 13241 case ISD::VECREDUCE_XOR: 13242 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); 13243 case ISD::VECREDUCE_FADD: 13244 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); 13245 case ISD::VECREDUCE_FMAX: 13246 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); 13247 case ISD::VECREDUCE_FMIN: 13248 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); 13249 default: 13250 llvm_unreachable("Unhandled fixed length reduction"); 13251 } 13252 } 13253 13254 // Lower NEON reductions. 
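// For example, an i32 vecreduce_add of a v4i32 operand becomes an
// AArch64ISD::UADDV over the vector followed by an extract of lane 0 (see
// getReductionSDNode above), while the FMAX/FMIN cases are emitted via the
// aarch64_neon_fmaxnmv/fminnmv intrinsics below.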
13255 SDLoc dl(Op); 13256 switch (Op.getOpcode()) { 13257 case ISD::VECREDUCE_ADD: 13258 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); 13259 case ISD::VECREDUCE_SMAX: 13260 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); 13261 case ISD::VECREDUCE_SMIN: 13262 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); 13263 case ISD::VECREDUCE_UMAX: 13264 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); 13265 case ISD::VECREDUCE_UMIN: 13266 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); 13267 case ISD::VECREDUCE_FMAX: { 13268 return DAG.getNode( 13269 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 13270 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), 13271 Src); 13272 } 13273 case ISD::VECREDUCE_FMIN: { 13274 return DAG.getNode( 13275 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 13276 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), 13277 Src); 13278 } 13279 default: 13280 llvm_unreachable("Unhandled reduction"); 13281 } 13282 } 13283 13284 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, 13285 SelectionDAG &DAG) const { 13286 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 13287 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) 13288 return SDValue(); 13289 13290 // LSE has an atomic load-add instruction, but not a load-sub. 13291 SDLoc dl(Op); 13292 MVT VT = Op.getSimpleValueType(); 13293 SDValue RHS = Op.getOperand(2); 13294 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 13295 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); 13296 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), 13297 Op.getOperand(0), Op.getOperand(1), RHS, 13298 AN->getMemOperand()); 13299 } 13300 13301 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, 13302 SelectionDAG &DAG) const { 13303 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 13304 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) 13305 return SDValue(); 13306 13307 // LSE has an atomic load-clear instruction, but not a load-and. 
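// Since clr(x, m) computes x & ~m, and(x, y) == clr(x, ~y): invert the RHS
// and emit ATOMIC_LOAD_CLR, which can then be selected to LDCLR (or the
// corresponding outlined-atomics helper).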
13308 SDLoc dl(Op); 13309 MVT VT = Op.getSimpleValueType(); 13310 SDValue RHS = Op.getOperand(2); 13311 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 13312 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); 13313 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), 13314 Op.getOperand(0), Op.getOperand(1), RHS, 13315 AN->getMemOperand()); 13316 } 13317 13318 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( 13319 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { 13320 SDLoc dl(Op); 13321 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 13322 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), 13323 PtrVT, 0); 13324 13325 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 13326 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); 13327 if (Subtarget->hasCustomCallingConv()) 13328 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 13329 13330 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, 13331 DAG.getConstant(4, dl, MVT::i64)); 13332 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); 13333 Chain = 13334 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), 13335 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), 13336 DAG.getRegisterMask(Mask), Chain.getValue(1)); 13337 // To match the actual intent better, we should read the output from X15 here 13338 // again (instead of potentially spilling it to the stack), but rereading Size 13339 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined 13340 // here. 13341 13342 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, 13343 DAG.getConstant(4, dl, MVT::i64)); 13344 return Chain; 13345 } 13346 13347 SDValue 13348 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 13349 SelectionDAG &DAG) const { 13350 assert(Subtarget->isTargetWindows() && 13351 "Only Windows alloca probing supported"); 13352 SDLoc dl(Op); 13353 // Get the inputs. 
13354 SDNode *Node = Op.getNode(); 13355 SDValue Chain = Op.getOperand(0); 13356 SDValue Size = Op.getOperand(1); 13357 MaybeAlign Align = 13358 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 13359 EVT VT = Node->getValueType(0); 13360 13361 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 13362 "no-stack-arg-probe")) { 13363 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 13364 Chain = SP.getValue(1); 13365 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 13366 if (Align) 13367 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 13368 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 13369 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 13370 SDValue Ops[2] = {SP, Chain}; 13371 return DAG.getMergeValues(Ops, dl); 13372 } 13373 13374 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 13375 13376 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); 13377 13378 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 13379 Chain = SP.getValue(1); 13380 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 13381 if (Align) 13382 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 13383 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 13384 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 13385 13386 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); 13387 13388 SDValue Ops[2] = {SP, Chain}; 13389 return DAG.getMergeValues(Ops, dl); 13390 } 13391 13392 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, 13393 SelectionDAG &DAG) const { 13394 EVT VT = Op.getValueType(); 13395 assert(VT != MVT::i64 && "Expected illegal VSCALE node"); 13396 13397 SDLoc DL(Op); 13398 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); 13399 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL, 13400 VT); 13401 } 13402 13403 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. 13404 template <unsigned NumVecs> 13405 static bool 13406 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, 13407 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { 13408 Info.opc = ISD::INTRINSIC_VOID; 13409 // Retrieve EC from first vector argument. 13410 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); 13411 ElementCount EC = VT.getVectorElementCount(); 13412 #ifndef NDEBUG 13413 // Check the assumption that all input vectors are the same type. 13414 for (unsigned I = 0; I < NumVecs; ++I) 13415 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && 13416 "Invalid type."); 13417 #endif 13418 // memVT is `NumVecs * VT`. 13419 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), 13420 EC * NumVecs); 13421 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1); 13422 Info.offset = 0; 13423 Info.align.reset(); 13424 Info.flags = MachineMemOperand::MOStore; 13425 return true; 13426 } 13427 13428 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 13429 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 13430 /// specified in the intrinsic calls. 
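/// For instance, an @llvm.aarch64.neon.ld2 call returning
/// { <4 x i32>, <4 x i32> } is described as an INTRINSIC_W_CHAIN node that
/// conservatively reads 256 bits (modelled as v4i64) from its pointer operand.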
13431 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 13432 const CallInst &I, 13433 MachineFunction &MF, 13434 unsigned Intrinsic) const { 13435 auto &DL = I.getModule()->getDataLayout(); 13436 switch (Intrinsic) { 13437 case Intrinsic::aarch64_sve_st2: 13438 return setInfoSVEStN<2>(*this, DL, Info, I); 13439 case Intrinsic::aarch64_sve_st3: 13440 return setInfoSVEStN<3>(*this, DL, Info, I); 13441 case Intrinsic::aarch64_sve_st4: 13442 return setInfoSVEStN<4>(*this, DL, Info, I); 13443 case Intrinsic::aarch64_neon_ld2: 13444 case Intrinsic::aarch64_neon_ld3: 13445 case Intrinsic::aarch64_neon_ld4: 13446 case Intrinsic::aarch64_neon_ld1x2: 13447 case Intrinsic::aarch64_neon_ld1x3: 13448 case Intrinsic::aarch64_neon_ld1x4: 13449 case Intrinsic::aarch64_neon_ld2lane: 13450 case Intrinsic::aarch64_neon_ld3lane: 13451 case Intrinsic::aarch64_neon_ld4lane: 13452 case Intrinsic::aarch64_neon_ld2r: 13453 case Intrinsic::aarch64_neon_ld3r: 13454 case Intrinsic::aarch64_neon_ld4r: { 13455 Info.opc = ISD::INTRINSIC_W_CHAIN; 13456 // Conservatively set memVT to the entire set of vectors loaded. 13457 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 13458 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 13459 Info.ptrVal = I.getArgOperand(I.arg_size() - 1); 13460 Info.offset = 0; 13461 Info.align.reset(); 13462 // volatile loads with NEON intrinsics not supported 13463 Info.flags = MachineMemOperand::MOLoad; 13464 return true; 13465 } 13466 case Intrinsic::aarch64_neon_st2: 13467 case Intrinsic::aarch64_neon_st3: 13468 case Intrinsic::aarch64_neon_st4: 13469 case Intrinsic::aarch64_neon_st1x2: 13470 case Intrinsic::aarch64_neon_st1x3: 13471 case Intrinsic::aarch64_neon_st1x4: 13472 case Intrinsic::aarch64_neon_st2lane: 13473 case Intrinsic::aarch64_neon_st3lane: 13474 case Intrinsic::aarch64_neon_st4lane: { 13475 Info.opc = ISD::INTRINSIC_VOID; 13476 // Conservatively set memVT to the entire set of vectors stored. 
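// (e.g. an st3 of three <4 x i32> vectors is modelled as a 384-bit store).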
13477 unsigned NumElts = 0; 13478 for (const Value *Arg : I.args()) { 13479 Type *ArgTy = Arg->getType(); 13480 if (!ArgTy->isVectorTy()) 13481 break; 13482 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 13483 } 13484 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 13485 Info.ptrVal = I.getArgOperand(I.arg_size() - 1); 13486 Info.offset = 0; 13487 Info.align.reset(); 13488 // volatile stores with NEON intrinsics not supported 13489 Info.flags = MachineMemOperand::MOStore; 13490 return true; 13491 } 13492 case Intrinsic::aarch64_ldaxr: 13493 case Intrinsic::aarch64_ldxr: { 13494 Type *ValTy = I.getParamElementType(0); 13495 Info.opc = ISD::INTRINSIC_W_CHAIN; 13496 Info.memVT = MVT::getVT(ValTy); 13497 Info.ptrVal = I.getArgOperand(0); 13498 Info.offset = 0; 13499 Info.align = DL.getABITypeAlign(ValTy); 13500 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 13501 return true; 13502 } 13503 case Intrinsic::aarch64_stlxr: 13504 case Intrinsic::aarch64_stxr: { 13505 Type *ValTy = I.getParamElementType(1); 13506 Info.opc = ISD::INTRINSIC_W_CHAIN; 13507 Info.memVT = MVT::getVT(ValTy); 13508 Info.ptrVal = I.getArgOperand(1); 13509 Info.offset = 0; 13510 Info.align = DL.getABITypeAlign(ValTy); 13511 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 13512 return true; 13513 } 13514 case Intrinsic::aarch64_ldaxp: 13515 case Intrinsic::aarch64_ldxp: 13516 Info.opc = ISD::INTRINSIC_W_CHAIN; 13517 Info.memVT = MVT::i128; 13518 Info.ptrVal = I.getArgOperand(0); 13519 Info.offset = 0; 13520 Info.align = Align(16); 13521 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 13522 return true; 13523 case Intrinsic::aarch64_stlxp: 13524 case Intrinsic::aarch64_stxp: 13525 Info.opc = ISD::INTRINSIC_W_CHAIN; 13526 Info.memVT = MVT::i128; 13527 Info.ptrVal = I.getArgOperand(2); 13528 Info.offset = 0; 13529 Info.align = Align(16); 13530 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 13531 return true; 13532 case Intrinsic::aarch64_sve_ldnt1: { 13533 Type *ElTy = cast<VectorType>(I.getType())->getElementType(); 13534 Info.opc = ISD::INTRINSIC_W_CHAIN; 13535 Info.memVT = MVT::getVT(I.getType()); 13536 Info.ptrVal = I.getArgOperand(1); 13537 Info.offset = 0; 13538 Info.align = DL.getABITypeAlign(ElTy); 13539 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; 13540 return true; 13541 } 13542 case Intrinsic::aarch64_sve_stnt1: { 13543 Type *ElTy = 13544 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType(); 13545 Info.opc = ISD::INTRINSIC_W_CHAIN; 13546 Info.memVT = MVT::getVT(I.getOperand(0)->getType()); 13547 Info.ptrVal = I.getArgOperand(2); 13548 Info.offset = 0; 13549 Info.align = DL.getABITypeAlign(ElTy); 13550 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; 13551 return true; 13552 } 13553 case Intrinsic::aarch64_mops_memset_tag: { 13554 Value *Dst = I.getArgOperand(0); 13555 Value *Val = I.getArgOperand(1); 13556 Info.opc = ISD::INTRINSIC_W_CHAIN; 13557 Info.memVT = MVT::getVT(Val->getType()); 13558 Info.ptrVal = Dst; 13559 Info.offset = 0; 13560 Info.align = I.getParamAlign(0).valueOrOne(); 13561 Info.flags = MachineMemOperand::MOStore; 13562 // The size of the memory being operated on is unknown at this point 13563 Info.size = MemoryLocation::UnknownSize; 13564 return true; 13565 } 13566 default: 13567 break; 13568 } 13569 13570 return false; 13571 } 13572 13573 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, 
ISD::LoadExtType ExtTy, 13575 EVT NewVT) const { 13576 // TODO: This may be worth removing. Check regression tests for diffs. 13577 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) 13578 return false; 13579
13580 // If we're reducing the load width in order to avoid having to use an extra 13581 // instruction to do extension then it's probably a good idea. 13582 if (ExtTy != ISD::NON_EXTLOAD) 13583 return true; 13584 // Don't reduce load width if it would prevent us from combining a shift into 13585 // the offset. 13586 MemSDNode *Mem = dyn_cast<MemSDNode>(Load); 13587 assert(Mem); 13588 const SDValue &Base = Mem->getBasePtr(); 13589 if (Base.getOpcode() == ISD::ADD && 13590 Base.getOperand(1).getOpcode() == ISD::SHL && 13591 Base.getOperand(1).hasOneUse() && 13592 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { 13593 // It's unknown whether a scalable vector has a power-of-2 bitwidth. 13594 if (Mem->getMemoryVT().isScalableVector()) 13595 return false; 13596 // The shift can be combined if it matches the size of the value being 13597 // loaded (and so reducing the width would make it not match). 13598 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); 13599 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; 13600 if (ShiftAmount == Log2_32(LoadBytes)) 13601 return false; 13602 } 13603 // We have no reason to disallow reducing the load width, so allow it. 13604 return true; 13605 } 13606
13607 // Truncations from a 64-bit GPR to a 32-bit GPR are free. 13608 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 13609 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13610 return false; 13611 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue(); 13612 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue(); 13613 return NumBits1 > NumBits2; 13614 } 13615 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 13616 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 13617 return false; 13618 uint64_t NumBits1 = VT1.getFixedSizeInBits(); 13619 uint64_t NumBits2 = VT2.getFixedSizeInBits(); 13620 return NumBits1 > NumBits2; 13621 } 13622
13623 /// Check if it is profitable to hoist an instruction from a then/else block to the if block. 13624 /// Not profitable if I and its user can form an FMA instruction 13625 /// because we prefer FMSUB/FMADD. 13626 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { 13627 if (I->getOpcode() != Instruction::FMul) 13628 return true; 13629
13630 if (!I->hasOneUse()) 13631 return true; 13632
13633 Instruction *User = I->user_back(); 13634
13635 if (!(User->getOpcode() == Instruction::FSub || 13636 User->getOpcode() == Instruction::FAdd)) 13637 return true; 13638
13639 const TargetOptions &Options = getTargetMachine().Options; 13640 const Function *F = I->getFunction(); 13641 const DataLayout &DL = F->getParent()->getDataLayout(); 13642 Type *Ty = User->getOperand(0)->getType(); 13643
13644 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && 13645 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && 13646 (Options.AllowFPOpFusion == FPOpFusion::Fast || 13647 Options.UnsafeFPMath)); 13648 } 13649
13650 // All 32-bit GPR operations implicitly zero the high-half of the corresponding 13651 // 64-bit GPR.
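// For example, after 'add w0, w1, w2' bits [63:32] of x0 are already zero,
// so a zext of that i32 result to i64 needs no extra instruction.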
13652 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 13653 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13654 return false; 13655 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 13656 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 13657 return NumBits1 == 32 && NumBits2 == 64; 13658 } 13659 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 13660 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 13661 return false; 13662 unsigned NumBits1 = VT1.getSizeInBits(); 13663 unsigned NumBits2 = VT2.getSizeInBits(); 13664 return NumBits1 == 32 && NumBits2 == 64; 13665 } 13666 13667 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 13668 EVT VT1 = Val.getValueType(); 13669 if (isZExtFree(VT1, VT2)) { 13670 return true; 13671 } 13672 13673 if (Val.getOpcode() != ISD::LOAD) 13674 return false; 13675 13676 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. 13677 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && 13678 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && 13679 VT1.getSizeInBits() <= 32); 13680 } 13681 13682 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { 13683 if (isa<FPExtInst>(Ext)) 13684 return false; 13685 13686 // Vector types are not free. 13687 if (Ext->getType()->isVectorTy()) 13688 return false; 13689 13690 for (const Use &U : Ext->uses()) { 13691 // The extension is free if we can fold it with a left shift in an 13692 // addressing mode or an arithmetic operation: add, sub, and cmp. 13693 13694 // Is there a shift? 13695 const Instruction *Instr = cast<Instruction>(U.getUser()); 13696 13697 // Is this a constant shift? 13698 switch (Instr->getOpcode()) { 13699 case Instruction::Shl: 13700 if (!isa<ConstantInt>(Instr->getOperand(1))) 13701 return false; 13702 break; 13703 case Instruction::GetElementPtr: { 13704 gep_type_iterator GTI = gep_type_begin(Instr); 13705 auto &DL = Ext->getModule()->getDataLayout(); 13706 std::advance(GTI, U.getOperandNo()-1); 13707 Type *IdxTy = GTI.getIndexedType(); 13708 // This extension will end up with a shift because of the scaling factor. 13709 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. 13710 // Get the shift amount based on the scaling factor: 13711 // log2(sizeof(IdxTy)) - log2(8). 13712 uint64_t ShiftAmt = 13713 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) - 13714 3; 13715 // Is the constant foldable in the shift of the addressing mode? 13716 // I.e., shift amount is between 1 and 4 inclusive. 13717 if (ShiftAmt == 0 || ShiftAmt > 4) 13718 return false; 13719 break; 13720 } 13721 case Instruction::Trunc: 13722 // Check if this is a noop. 13723 // trunc(sext ty1 to ty2) to ty1. 13724 if (Instr->getType() == Ext->getOperand(0)->getType()) 13725 continue; 13726 [[fallthrough]]; 13727 default: 13728 return false; 13729 } 13730 13731 // At this point we can use the bfm family, so this extension is free 13732 // for that use. 13733 } 13734 return true; 13735 } 13736 13737 static bool isSplatShuffle(Value *V) { 13738 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) 13739 return all_equal(Shuf->getShuffleMask()); 13740 return false; 13741 } 13742 13743 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower 13744 /// or upper half of the vector elements. 
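/// When both operands are genuine extracts (not splats) they must also select
/// the same half, e.g. both take lanes <8,...,15> of a <16 x i8> source, so
/// that the pair can feed a single high-half instruction such as umull2.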
13745 static bool areExtractShuffleVectors(Value *Op1, Value *Op2, 13746 bool AllowSplat = false) { 13747 auto areTypesHalfed = [](Value *FullV, Value *HalfV) { 13748 auto *FullTy = FullV->getType(); 13749 auto *HalfTy = HalfV->getType(); 13750 return FullTy->getPrimitiveSizeInBits().getFixedValue() == 13751 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); 13752 }; 13753 13754 auto extractHalf = [](Value *FullV, Value *HalfV) { 13755 auto *FullVT = cast<FixedVectorType>(FullV->getType()); 13756 auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); 13757 return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); 13758 }; 13759 13760 ArrayRef<int> M1, M2; 13761 Value *S1Op1 = nullptr, *S2Op1 = nullptr; 13762 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || 13763 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) 13764 return false; 13765 13766 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that 13767 // it is not checked as an extract below. 13768 if (AllowSplat && isSplatShuffle(Op1)) 13769 S1Op1 = nullptr; 13770 if (AllowSplat && isSplatShuffle(Op2)) 13771 S2Op1 = nullptr; 13772 13773 // Check that the operands are half as wide as the result and we extract 13774 // half of the elements of the input vectors. 13775 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || 13776 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) 13777 return false; 13778 13779 // Check the mask extracts either the lower or upper half of vector 13780 // elements. 13781 int M1Start = 0; 13782 int M2Start = 0; 13783 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; 13784 if ((S1Op1 && 13785 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || 13786 (S2Op1 && 13787 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) 13788 return false; 13789 13790 if ((M1Start != 0 && M1Start != (NumElements / 2)) || 13791 (M2Start != 0 && M2Start != (NumElements / 2))) 13792 return false; 13793 if (S1Op1 && S2Op1 && M1Start != M2Start) 13794 return false; 13795 13796 return true; 13797 } 13798 13799 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 13800 /// of the vector elements. 13801 static bool areExtractExts(Value *Ext1, Value *Ext2) { 13802 auto areExtDoubled = [](Instruction *Ext) { 13803 return Ext->getType()->getScalarSizeInBits() == 13804 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 13805 }; 13806 13807 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 13808 !match(Ext2, m_ZExtOrSExt(m_Value())) || 13809 !areExtDoubled(cast<Instruction>(Ext1)) || 13810 !areExtDoubled(cast<Instruction>(Ext2))) 13811 return false; 13812 13813 return true; 13814 } 13815 13816 /// Check if Op could be used with vmull_high_p64 intrinsic. 13817 static bool isOperandOfVmullHighP64(Value *Op) { 13818 Value *VectorOperand = nullptr; 13819 ConstantInt *ElementIndex = nullptr; 13820 return match(Op, m_ExtractElt(m_Value(VectorOperand), 13821 m_ConstantInt(ElementIndex))) && 13822 ElementIndex->getValue() == 1 && 13823 isa<FixedVectorType>(VectorOperand->getType()) && 13824 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; 13825 } 13826 13827 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. 
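/// That is, both are extractelement of lane 1 of a <2 x i64> vector, i.e. the
/// high halves that the PMULL2 form of vmull_high_p64 consumes.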
13828 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { 13829 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); 13830 } 13831 13832 /// Check if sinking \p I's operands to I's basic block is profitable, because 13833 /// the operands can be folded into a target instruction, e.g. 13834 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 13835 bool AArch64TargetLowering::shouldSinkOperands( 13836 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 13837 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 13838 switch (II->getIntrinsicID()) { 13839 case Intrinsic::aarch64_neon_smull: 13840 case Intrinsic::aarch64_neon_umull: 13841 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), 13842 /*AllowSplat=*/true)) { 13843 Ops.push_back(&II->getOperandUse(0)); 13844 Ops.push_back(&II->getOperandUse(1)); 13845 return true; 13846 } 13847 [[fallthrough]]; 13848 13849 case Intrinsic::fma: 13850 if (isa<VectorType>(I->getType()) && 13851 cast<VectorType>(I->getType())->getElementType()->isHalfTy() && 13852 !Subtarget->hasFullFP16()) 13853 return false; 13854 [[fallthrough]]; 13855 case Intrinsic::aarch64_neon_sqdmull: 13856 case Intrinsic::aarch64_neon_sqdmulh: 13857 case Intrinsic::aarch64_neon_sqrdmulh: 13858 // Sink splats for index lane variants 13859 if (isSplatShuffle(II->getOperand(0))) 13860 Ops.push_back(&II->getOperandUse(0)); 13861 if (isSplatShuffle(II->getOperand(1))) 13862 Ops.push_back(&II->getOperandUse(1)); 13863 return !Ops.empty(); 13864 case Intrinsic::aarch64_sve_ptest_first: 13865 case Intrinsic::aarch64_sve_ptest_last: 13866 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0))) 13867 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) 13868 Ops.push_back(&II->getOperandUse(0)); 13869 return !Ops.empty(); 13870 case Intrinsic::aarch64_sme_write_horiz: 13871 case Intrinsic::aarch64_sme_write_vert: 13872 case Intrinsic::aarch64_sme_writeq_horiz: 13873 case Intrinsic::aarch64_sme_writeq_vert: { 13874 auto *Idx = dyn_cast<Instruction>(II->getOperand(1)); 13875 if (!Idx || Idx->getOpcode() != Instruction::Add) 13876 return false; 13877 Ops.push_back(&II->getOperandUse(1)); 13878 return true; 13879 } 13880 case Intrinsic::aarch64_sme_read_horiz: 13881 case Intrinsic::aarch64_sme_read_vert: 13882 case Intrinsic::aarch64_sme_readq_horiz: 13883 case Intrinsic::aarch64_sme_readq_vert: 13884 case Intrinsic::aarch64_sme_ld1b_vert: 13885 case Intrinsic::aarch64_sme_ld1h_vert: 13886 case Intrinsic::aarch64_sme_ld1w_vert: 13887 case Intrinsic::aarch64_sme_ld1d_vert: 13888 case Intrinsic::aarch64_sme_ld1q_vert: 13889 case Intrinsic::aarch64_sme_st1b_vert: 13890 case Intrinsic::aarch64_sme_st1h_vert: 13891 case Intrinsic::aarch64_sme_st1w_vert: 13892 case Intrinsic::aarch64_sme_st1d_vert: 13893 case Intrinsic::aarch64_sme_st1q_vert: 13894 case Intrinsic::aarch64_sme_ld1b_horiz: 13895 case Intrinsic::aarch64_sme_ld1h_horiz: 13896 case Intrinsic::aarch64_sme_ld1w_horiz: 13897 case Intrinsic::aarch64_sme_ld1d_horiz: 13898 case Intrinsic::aarch64_sme_ld1q_horiz: 13899 case Intrinsic::aarch64_sme_st1b_horiz: 13900 case Intrinsic::aarch64_sme_st1h_horiz: 13901 case Intrinsic::aarch64_sme_st1w_horiz: 13902 case Intrinsic::aarch64_sme_st1d_horiz: 13903 case Intrinsic::aarch64_sme_st1q_horiz: { 13904 auto *Idx = dyn_cast<Instruction>(II->getOperand(3)); 13905 if (!Idx || Idx->getOpcode() != Instruction::Add) 13906 return false; 13907 Ops.push_back(&II->getOperandUse(3)); 13908 return true; 13909 } 13910 case 
Intrinsic::aarch64_neon_pmull: 13911 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) 13912 return false; 13913 Ops.push_back(&II->getOperandUse(0)); 13914 Ops.push_back(&II->getOperandUse(1)); 13915 return true; 13916 case Intrinsic::aarch64_neon_pmull64: 13917 if (!areOperandsOfVmullHighP64(II->getArgOperand(0), 13918 II->getArgOperand(1))) 13919 return false; 13920 Ops.push_back(&II->getArgOperandUse(0)); 13921 Ops.push_back(&II->getArgOperandUse(1)); 13922 return true; 13923 default: 13924 return false; 13925 } 13926 } 13927 13928 if (!I->getType()->isVectorTy()) 13929 return false; 13930 13931 switch (I->getOpcode()) { 13932 case Instruction::Sub: 13933 case Instruction::Add: { 13934 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 13935 return false; 13936 13937 // If the exts' operands extract either the lower or upper elements, we 13938 // can sink them too. 13939 auto Ext1 = cast<Instruction>(I->getOperand(0)); 13940 auto Ext2 = cast<Instruction>(I->getOperand(1)); 13941 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { 13942 Ops.push_back(&Ext1->getOperandUse(0)); 13943 Ops.push_back(&Ext2->getOperandUse(0)); 13944 } 13945 13946 Ops.push_back(&I->getOperandUse(0)); 13947 Ops.push_back(&I->getOperandUse(1)); 13948 13949 return true; 13950 } 13951 case Instruction::Mul: { 13952 int NumZExts = 0, NumSExts = 0; 13953 for (auto &Op : I->operands()) { 13954 // Make sure we are not already sinking this operand 13955 if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) 13956 continue; 13957 13958 if (match(&Op, m_SExt(m_Value()))) { 13959 NumSExts++; 13960 continue; 13961 } else if (match(&Op, m_ZExt(m_Value()))) { 13962 NumZExts++; 13963 continue; 13964 } 13965 13966 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); 13967 13968 // If the Shuffle is a splat and the operand is a zext/sext, sinking the 13969 // operand and the s/zext can help create indexed s/umull. This is 13970 // especially useful to prevent i64 mul being scalarized. 13971 if (Shuffle && isSplatShuffle(Shuffle) && 13972 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { 13973 Ops.push_back(&Shuffle->getOperandUse(0)); 13974 Ops.push_back(&Op); 13975 if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) 13976 NumSExts++; 13977 else 13978 NumZExts++; 13979 continue; 13980 } 13981 13982 if (!Shuffle) 13983 continue; 13984 13985 Value *ShuffleOperand = Shuffle->getOperand(0); 13986 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); 13987 if (!Insert) 13988 continue; 13989 13990 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); 13991 if (!OperandInstr) 13992 continue; 13993 13994 ConstantInt *ElementConstant = 13995 dyn_cast<ConstantInt>(Insert->getOperand(2)); 13996 // Check that the insertelement is inserting into element 0 13997 if (!ElementConstant || ElementConstant->getZExtValue() != 0) 13998 continue; 13999 14000 unsigned Opcode = OperandInstr->getOpcode(); 14001 if (Opcode == Instruction::SExt) 14002 NumSExts++; 14003 else if (Opcode == Instruction::ZExt) 14004 NumZExts++; 14005 else { 14006 // If we find that the top bits are known 0, then we can sink and allow 14007 // the backend to generate a umull. 
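// For example, for a v2i64 mul, a splatted scalar produced by
// 'and x, 0xffffffff' has its top 32 bits known zero, so it can be sunk and
// treated like a zext when forming umull.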
14008 unsigned Bitwidth = I->getType()->getScalarSizeInBits(); 14009 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); 14010 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout(); 14011 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) 14012 continue; 14013 NumZExts++; 14014 } 14015 14016 Ops.push_back(&Shuffle->getOperandUse(0)); 14017 Ops.push_back(&Op); 14018 } 14019 14020 // Is it profitable to sink if we found two of the same type of extends. 14021 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); 14022 } 14023 default: 14024 return false; 14025 } 14026 return false; 14027 } 14028 14029 static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) { 14030 Value *Op = ZExt->getOperand(0); 14031 auto *SrcTy = cast<FixedVectorType>(Op->getType()); 14032 auto *DstTy = cast<FixedVectorType>(ZExt->getType()); 14033 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); 14034 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth(); 14035 assert(DstWidth % SrcWidth == 0 && 14036 "TBL lowering is not supported for a ZExt instruction with this " 14037 "source & destination element type."); 14038 unsigned ZExtFactor = DstWidth / SrcWidth; 14039 unsigned NumElts = SrcTy->getNumElements(); 14040 IRBuilder<> Builder(ZExt); 14041 SmallVector<int> Mask; 14042 // Create a mask that selects <0,...,Op[i]> for each lane of the destination 14043 // vector to replace the original ZExt. This can later be lowered to a set of 14044 // tbl instructions. 14045 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { 14046 if (IsLittleEndian) { 14047 if (i % ZExtFactor == 0) 14048 Mask.push_back(i / ZExtFactor); 14049 else 14050 Mask.push_back(NumElts); 14051 } else { 14052 if ((i + 1) % ZExtFactor == 0) 14053 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); 14054 else 14055 Mask.push_back(NumElts); 14056 } 14057 } 14058 14059 auto *FirstEltZero = Builder.CreateInsertElement( 14060 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); 14061 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); 14062 Result = Builder.CreateBitCast(Result, DstTy); 14063 ZExt->replaceAllUsesWith(Result); 14064 ZExt->eraseFromParent(); 14065 } 14066 14067 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { 14068 IRBuilder<> Builder(TI); 14069 SmallVector<Value *> Parts; 14070 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements(); 14071 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType()); 14072 auto *DstTy = cast<FixedVectorType>(TI->getType()); 14073 assert(SrcTy->getElementType()->isIntegerTy() && 14074 "Non-integer type source vector element is not supported"); 14075 assert(DstTy->getElementType()->isIntegerTy(8) && 14076 "Unsupported destination vector element type"); 14077 unsigned SrcElemTySz = 14078 cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); 14079 unsigned DstElemTySz = 14080 cast<IntegerType>(DstTy->getElementType())->getBitWidth(); 14081 assert((SrcElemTySz % DstElemTySz == 0) && 14082 "Cannot lower truncate to tbl instructions for a source element size " 14083 "that is not divisible by the destination element size"); 14084 unsigned TruncFactor = SrcElemTySz / DstElemTySz; 14085 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && 14086 "Unsupported source vector element type size"); 14087 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16); 14088 14089 // Create a mask to choose every nth byte from the source vector table of 
14090 // bytes to create the truncated destination vector, where 'n' is the truncate 14091 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose 14092 // 0,8,16,..Y*8th bytes for the little-endian format 14093 SmallVector<Constant *, 16> MaskConst; 14094 for (int Itr = 0; Itr < 16; Itr++) { 14095 if (Itr < NumElements) 14096 MaskConst.push_back(Builder.getInt8( 14097 IsLittleEndian ? Itr * TruncFactor 14098 : Itr * TruncFactor + (TruncFactor - 1))); 14099 else 14100 MaskConst.push_back(Builder.getInt8(255)); 14101 } 14102 14103 int MaxTblSz = 128 * 4; 14104 int MaxSrcSz = SrcElemTySz * NumElements; 14105 int ElemsPerTbl = 14106 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz); 14107 assert(ElemsPerTbl <= 16 && 14108 "Maximum elements selected using TBL instruction cannot exceed 16!"); 14109 14110 int ShuffleCount = 128 / SrcElemTySz; 14111 SmallVector<int> ShuffleLanes; 14112 for (int i = 0; i < ShuffleCount; ++i) 14113 ShuffleLanes.push_back(i); 14114 14115 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles 14116 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated, 14117 // call TBL & save the result in a vector of TBL results for combining later. 14118 SmallVector<Value *> Results; 14119 while (ShuffleLanes.back() < NumElements) { 14120 Parts.push_back(Builder.CreateBitCast( 14121 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); 14122 14123 if (Parts.size() == 4) { 14124 auto *F = Intrinsic::getDeclaration(TI->getModule(), 14125 Intrinsic::aarch64_neon_tbl4, VecTy); 14126 Parts.push_back(ConstantVector::get(MaskConst)); 14127 Results.push_back(Builder.CreateCall(F, Parts)); 14128 Parts.clear(); 14129 } 14130 14131 for (int i = 0; i < ShuffleCount; ++i) 14132 ShuffleLanes[i] += ShuffleCount; 14133 } 14134 14135 assert((Parts.empty() || Results.empty()) && 14136 "Lowering trunc for vectors requiring different TBL instructions is " 14137 "not supported!"); 14138 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD 14139 // registers 14140 if (!Parts.empty()) { 14141 Intrinsic::ID TblID; 14142 switch (Parts.size()) { 14143 case 1: 14144 TblID = Intrinsic::aarch64_neon_tbl1; 14145 break; 14146 case 2: 14147 TblID = Intrinsic::aarch64_neon_tbl2; 14148 break; 14149 case 3: 14150 TblID = Intrinsic::aarch64_neon_tbl3; 14151 break; 14152 } 14153 14154 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); 14155 Parts.push_back(ConstantVector::get(MaskConst)); 14156 Results.push_back(Builder.CreateCall(F, Parts)); 14157 } 14158 14159 // Extract the destination vector from TBL result(s) after combining them 14160 // where applicable. Currently, at most two TBLs are supported. 
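// For example, a trunc of <16 x i64> to <16 x i8> produces two tbl4 calls,
// each yielding 8 of the result bytes, which are then combined by the final
// shufflevector below.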
14161 assert(Results.size() <= 2 && "Trunc lowering does not support generation of " 14162 "more than 2 tbl instructions!"); 14163 Value *FinalResult = Results[0]; 14164 if (Results.size() == 1) { 14165 if (ElemsPerTbl < 16) { 14166 SmallVector<int> FinalMask(ElemsPerTbl); 14167 std::iota(FinalMask.begin(), FinalMask.end(), 0); 14168 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask); 14169 } 14170 } else { 14171 SmallVector<int> FinalMask(ElemsPerTbl * Results.size()); 14172 if (ElemsPerTbl < 16) { 14173 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0); 14174 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16); 14175 } else { 14176 std::iota(FinalMask.begin(), FinalMask.end(), 0); 14177 } 14178 FinalResult = 14179 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask); 14180 } 14181 14182 TI->replaceAllUsesWith(FinalResult); 14183 TI->eraseFromParent(); 14184 } 14185 14186 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I, 14187 Loop *L) const { 14188 // shuffle_vector instructions are serialized when targeting SVE, 14189 // see LowerSPLAT_VECTOR. This peephole is not beneficial. 14190 if (Subtarget->useSVEForFixedLengthVectors()) 14191 return false; 14192 14193 // Try to optimize conversions using tbl. This requires materializing constant 14194 // index vectors, which can increase code size and add loads. Skip the 14195 // transform unless the conversion is in a loop block guaranteed to execute 14196 // and we are not optimizing for size. 14197 Function *F = I->getParent()->getParent(); 14198 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || 14199 F->hasOptSize()) 14200 return false; 14201 14202 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); 14203 auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); 14204 if (!SrcTy || !DstTy) 14205 return false; 14206 14207 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be 14208 // lowered to tbl instructions to insert the original i8 elements 14209 // into i8x lanes. This is enabled for cases where it is beneficial. 14210 auto *ZExt = dyn_cast<ZExtInst>(I); 14211 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) { 14212 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth(); 14213 if (DstWidth % 8 == 0 && DstWidth > 16 && DstWidth < 64) { 14214 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); 14215 return true; 14216 } 14217 } 14218 14219 auto *UIToFP = dyn_cast<UIToFPInst>(I); 14220 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && 14221 DstTy->getElementType()->isFloatTy()) { 14222 IRBuilder<> Builder(I); 14223 auto *ZExt = cast<ZExtInst>( 14224 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy))); 14225 auto *UI = Builder.CreateUIToFP(ZExt, DstTy); 14226 I->replaceAllUsesWith(UI); 14227 I->eraseFromParent(); 14228 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); 14229 return true; 14230 } 14231 14232 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui 14233 // followed by a truncate lowered to using tbl.4. 
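// e.g. 'fptoui <8 x float> %x to <8 x i8>' is rewritten as an fptoui to
// <8 x i32> plus a trunc to <8 x i8>, with the trunc handled by
// createTblForTrunc above.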
14234 auto *FPToUI = dyn_cast<FPToUIInst>(I); 14235 if (FPToUI && 14236 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && 14237 SrcTy->getElementType()->isFloatTy() && 14238 DstTy->getElementType()->isIntegerTy(8)) { 14239 IRBuilder<> Builder(I); 14240 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0), 14241 VectorType::getInteger(SrcTy)); 14242 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy); 14243 I->replaceAllUsesWith(TruncI); 14244 I->eraseFromParent(); 14245 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian()); 14246 return true; 14247 } 14248 14249 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate 14250 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits 14251 // per lane of the input that is represented using 1,2,3 or 4 128-bit table 14252 // registers 14253 auto *TI = dyn_cast<TruncInst>(I); 14254 if (TI && DstTy->getElementType()->isIntegerTy(8) && 14255 ((SrcTy->getElementType()->isIntegerTy(32) || 14256 SrcTy->getElementType()->isIntegerTy(64)) && 14257 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { 14258 createTblForTrunc(TI, Subtarget->isLittleEndian()); 14259 return true; 14260 } 14261 14262 return false; 14263 } 14264 14265 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 14266 Align &RequiredAligment) const { 14267 if (!LoadedType.isSimple() || 14268 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 14269 return false; 14270 // Cyclone supports unaligned accesses. 14271 RequiredAligment = Align(1); 14272 unsigned NumBits = LoadedType.getSizeInBits(); 14273 return NumBits == 32 || NumBits == 64; 14274 } 14275 14276 /// A helper function for determining the number of interleaved accesses we 14277 /// will generate when lowering accesses of the given type. 14278 unsigned AArch64TargetLowering::getNumInterleavedAccesses( 14279 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const { 14280 unsigned VecSize = 128; 14281 if (UseScalable) 14282 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u); 14283 return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize); 14284 } 14285 14286 MachineMemOperand::Flags 14287 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { 14288 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && 14289 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) 14290 return MOStridedAccess; 14291 return MachineMemOperand::MONone; 14292 } 14293 14294 bool AArch64TargetLowering::isLegalInterleavedAccessType( 14295 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const { 14296 14297 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 14298 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 14299 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); 14300 14301 UseScalable = false; 14302 14303 // Ensure that the predicate for this number of elements is available. 14304 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements)) 14305 return false; 14306 14307 // Ensure the number of vector elements is greater than 1. 14308 if (NumElements < 2) 14309 return false; 14310 14311 // Ensure the element type is legal. 
14312 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) 14313 return false; 14314 14315 if (Subtarget->forceStreamingCompatibleSVE() || 14316 (Subtarget->useSVEForFixedLengthVectors() && 14317 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 || 14318 (VecSize < Subtarget->getMinSVEVectorSizeInBits() && 14319 isPowerOf2_32(NumElements) && VecSize > 128)))) { 14320 UseScalable = true; 14321 return true; 14322 } 14323 14324 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 14325 // 128 will be split into multiple interleaved accesses. 14326 return VecSize == 64 || VecSize % 128 == 0; 14327 } 14328 14329 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) { 14330 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext())) 14331 return ScalableVectorType::get(VTy->getElementType(), 2); 14332 14333 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext())) 14334 return ScalableVectorType::get(VTy->getElementType(), 4); 14335 14336 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext())) 14337 return ScalableVectorType::get(VTy->getElementType(), 8); 14338 14339 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext())) 14340 return ScalableVectorType::get(VTy->getElementType(), 8); 14341 14342 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext())) 14343 return ScalableVectorType::get(VTy->getElementType(), 2); 14344 14345 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext())) 14346 return ScalableVectorType::get(VTy->getElementType(), 4); 14347 14348 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext())) 14349 return ScalableVectorType::get(VTy->getElementType(), 8); 14350 14351 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext())) 14352 return ScalableVectorType::get(VTy->getElementType(), 16); 14353 14354 llvm_unreachable("Cannot handle input vector type"); 14355 } 14356 14357 /// Lower an interleaved load into a ldN intrinsic. 14358 /// 14359 /// E.g. Lower an interleaved load (Factor = 2): 14360 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 14361 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 14362 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 14363 /// 14364 /// Into: 14365 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 14366 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 14367 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 14368 bool AArch64TargetLowering::lowerInterleavedLoad( 14369 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 14370 ArrayRef<unsigned> Indices, unsigned Factor) const { 14371 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 14372 "Invalid interleave factor"); 14373 assert(!Shuffles.empty() && "Empty shufflevector input"); 14374 assert(Shuffles.size() == Indices.size() && 14375 "Unmatched number of shufflevectors and indices"); 14376 14377 const DataLayout &DL = LI->getModule()->getDataLayout(); 14378 14379 VectorType *VTy = Shuffles[0]->getType(); 14380 14381 // Skip if we do not have NEON and skip illegal vector types. We can 14382 // "legalize" wide vector types into multiple interleaved accesses as long as 14383 // the vector types are divisible by 128. 
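// For example, with Factor == 2 an <8 x i64> sub-vector type is handled as
// four ld2 calls on <2 x i64> parts, each consuming 256 bits of the
// interleaved data.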
14384 bool UseScalable; 14385 if (!Subtarget->hasNEON() || 14386 !isLegalInterleavedAccessType(VTy, DL, UseScalable)) 14387 return false; 14388 14389 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable); 14390 14391 auto *FVTy = cast<FixedVectorType>(VTy); 14392 14393 // A pointer vector can not be the return type of the ldN intrinsics. Need to 14394 // load integer vectors first and then convert to pointer vectors. 14395 Type *EltTy = FVTy->getElementType(); 14396 if (EltTy->isPointerTy()) 14397 FVTy = 14398 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); 14399 14400 // If we're going to generate more than one load, reset the sub-vector type 14401 // to something legal. 14402 FVTy = FixedVectorType::get(FVTy->getElementType(), 14403 FVTy->getNumElements() / NumLoads); 14404 14405 auto *LDVTy = 14406 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy; 14407 14408 IRBuilder<> Builder(LI); 14409 14410 // The base address of the load. 14411 Value *BaseAddr = LI->getPointerOperand(); 14412 14413 if (NumLoads > 1) { 14414 // We will compute the pointer operand of each load from the original base 14415 // address using GEPs. Cast the base address to a pointer to the scalar 14416 // element type. 14417 BaseAddr = Builder.CreateBitCast( 14418 BaseAddr, 14419 LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 14420 } 14421 14422 Type *PtrTy = 14423 UseScalable 14424 ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()) 14425 : LDVTy->getPointerTo(LI->getPointerAddressSpace()); 14426 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()), 14427 LDVTy->getElementCount()); 14428 14429 static const Intrinsic::ID SVELoadIntrs[3] = { 14430 Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret, 14431 Intrinsic::aarch64_sve_ld4_sret}; 14432 static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2, 14433 Intrinsic::aarch64_neon_ld3, 14434 Intrinsic::aarch64_neon_ld4}; 14435 Function *LdNFunc; 14436 if (UseScalable) 14437 LdNFunc = Intrinsic::getDeclaration(LI->getModule(), 14438 SVELoadIntrs[Factor - 2], {LDVTy}); 14439 else 14440 LdNFunc = Intrinsic::getDeclaration( 14441 LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy}); 14442 14443 // Holds sub-vectors extracted from the load intrinsic return values. The 14444 // sub-vectors are associated with the shufflevector instructions they will 14445 // replace. 14446 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 14447 14448 Value *PTrue = nullptr; 14449 if (UseScalable) { 14450 std::optional<unsigned> PgPattern = 14451 getSVEPredPatternFromNumElements(FVTy->getNumElements()); 14452 if (Subtarget->getMinSVEVectorSizeInBits() == 14453 Subtarget->getMaxSVEVectorSizeInBits() && 14454 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy)) 14455 PgPattern = AArch64SVEPredPattern::all; 14456 14457 auto *PTruePat = 14458 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern); 14459 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, 14460 {PTruePat}); 14461 } 14462 14463 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 14464 14465 // If we're generating more than one load, compute the base address of 14466 // subsequent loads as an offset from the previous. 
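// (The step is FVTy->getNumElements() * Factor scalar elements, i.e. the
// amount of data consumed by one ldN call.)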
14467 if (LoadCount > 0) 14468 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr, 14469 FVTy->getNumElements() * Factor); 14470 14471 CallInst *LdN; 14472 if (UseScalable) 14473 LdN = Builder.CreateCall( 14474 LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN"); 14475 else 14476 LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), 14477 "ldN"); 14478 14479 // Extract and store the sub-vectors returned by the load intrinsic. 14480 for (unsigned i = 0; i < Shuffles.size(); i++) { 14481 ShuffleVectorInst *SVI = Shuffles[i]; 14482 unsigned Index = Indices[i]; 14483 14484 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 14485 14486 if (UseScalable) 14487 SubVec = Builder.CreateExtractVector( 14488 FVTy, SubVec, 14489 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0)); 14490 14491 // Convert the integer vector to pointer vector if the element is pointer. 14492 if (EltTy->isPointerTy()) 14493 SubVec = Builder.CreateIntToPtr( 14494 SubVec, FixedVectorType::get(SVI->getType()->getElementType(), 14495 FVTy->getNumElements())); 14496 14497 SubVecs[SVI].push_back(SubVec); 14498 } 14499 } 14500 14501 // Replace uses of the shufflevector instructions with the sub-vectors 14502 // returned by the load intrinsic. If a shufflevector instruction is 14503 // associated with more than one sub-vector, those sub-vectors will be 14504 // concatenated into a single wide vector. 14505 for (ShuffleVectorInst *SVI : Shuffles) { 14506 auto &SubVec = SubVecs[SVI]; 14507 auto *WideVec = 14508 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 14509 SVI->replaceAllUsesWith(WideVec); 14510 } 14511 14512 return true; 14513 } 14514 14515 /// Lower an interleaved store into a stN intrinsic. 14516 /// 14517 /// E.g. Lower an interleaved store (Factor = 3): 14518 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 14519 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 14520 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 14521 /// 14522 /// Into: 14523 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 14524 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 14525 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 14526 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 14527 /// 14528 /// Note that the new shufflevectors will be removed and we'll only generate one 14529 /// st3 instruction in CodeGen. 14530 /// 14531 /// Example for a more general valid mask (Factor 3). Lower: 14532 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 14533 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 14534 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 14535 /// 14536 /// Into: 14537 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 14538 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 14539 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 14540 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 14541 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 14542 ShuffleVectorInst *SVI, 14543 unsigned Factor) const { 14544 // Skip if streaming compatible SVE is enabled, because it generates invalid 14545 // code in streaming mode when SVE length is not specified. 
14546 if (Subtarget->forceStreamingCompatibleSVE()) 14547 return false; 14548 14549 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 14550 "Invalid interleave factor"); 14551 14552 auto *VecTy = cast<FixedVectorType>(SVI->getType()); 14553 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); 14554 14555 unsigned LaneLen = VecTy->getNumElements() / Factor; 14556 Type *EltTy = VecTy->getElementType(); 14557 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); 14558 14559 const DataLayout &DL = SI->getModule()->getDataLayout(); 14560 bool UseScalable; 14561 14562 // Skip if we do not have NEON and skip illegal vector types. We can 14563 // "legalize" wide vector types into multiple interleaved accesses as long as 14564 // the vector types are divisible by 128. 14565 if (!Subtarget->hasNEON() || 14566 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 14567 return false; 14568 14569 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 14570 14571 Value *Op0 = SVI->getOperand(0); 14572 Value *Op1 = SVI->getOperand(1); 14573 IRBuilder<> Builder(SI); 14574 14575 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 14576 // vectors to integer vectors. 14577 if (EltTy->isPointerTy()) { 14578 Type *IntTy = DL.getIntPtrType(EltTy); 14579 unsigned NumOpElts = 14580 cast<FixedVectorType>(Op0->getType())->getNumElements(); 14581 14582 // Convert to the corresponding integer vector. 14583 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); 14584 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 14585 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 14586 14587 SubVecTy = FixedVectorType::get(IntTy, LaneLen); 14588 } 14589 14590 // If we're going to generate more than one store, reset the lane length 14591 // and sub-vector type to something legal. 14592 LaneLen /= NumStores; 14593 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); 14594 14595 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy)) 14596 : SubVecTy; 14597 14598 // The base address of the store. 14599 Value *BaseAddr = SI->getPointerOperand(); 14600 14601 if (NumStores > 1) { 14602 // We will compute the pointer operand of each store from the original base 14603 // address using GEPs. Cast the base address to a pointer to the scalar 14604 // element type. 14605 BaseAddr = Builder.CreateBitCast( 14606 BaseAddr, 14607 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); 14608 } 14609 14610 auto Mask = SVI->getShuffleMask(); 14611 14612 // Sanity check if all the indices are NOT in range. 14613 // If mask is `undef` or `poison`, `Mask` may be a vector of -1s. 14614 // If all of them are `undef`, OOB read will happen later. 14615 if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) { 14616 return false; 14617 } 14618 14619 Type *PtrTy = 14620 UseScalable 14621 ? 
STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()) 14622 : STVTy->getPointerTo(SI->getPointerAddressSpace()); 14623 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()), 14624 STVTy->getElementCount()); 14625 14626 static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2, 14627 Intrinsic::aarch64_sve_st3, 14628 Intrinsic::aarch64_sve_st4}; 14629 static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2, 14630 Intrinsic::aarch64_neon_st3, 14631 Intrinsic::aarch64_neon_st4}; 14632 Function *StNFunc; 14633 if (UseScalable) 14634 StNFunc = Intrinsic::getDeclaration(SI->getModule(), 14635 SVEStoreIntrs[Factor - 2], {STVTy}); 14636 else 14637 StNFunc = Intrinsic::getDeclaration( 14638 SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy}); 14639 14640 Value *PTrue = nullptr; 14641 if (UseScalable) { 14642 std::optional<unsigned> PgPattern = 14643 getSVEPredPatternFromNumElements(SubVecTy->getNumElements()); 14644 if (Subtarget->getMinSVEVectorSizeInBits() == 14645 Subtarget->getMaxSVEVectorSizeInBits() && 14646 Subtarget->getMinSVEVectorSizeInBits() == 14647 DL.getTypeSizeInBits(SubVecTy)) 14648 PgPattern = AArch64SVEPredPattern::all; 14649 14650 auto *PTruePat = 14651 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern); 14652 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, 14653 {PTruePat}); 14654 } 14655 14656 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 14657 14658 SmallVector<Value *, 5> Ops; 14659 14660 // Split the shufflevector operands into sub vectors for the new stN call. 14661 for (unsigned i = 0; i < Factor; i++) { 14662 Value *Shuffle; 14663 unsigned IdxI = StoreCount * LaneLen * Factor + i; 14664 if (Mask[IdxI] >= 0) { 14665 Shuffle = Builder.CreateShuffleVector( 14666 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)); 14667 } else { 14668 unsigned StartMask = 0; 14669 for (unsigned j = 1; j < LaneLen; j++) { 14670 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i; 14671 if (Mask[IdxJ] >= 0) { 14672 StartMask = Mask[IdxJ] - j; 14673 break; 14674 } 14675 } 14676 // Note: Filling undef gaps with random elements is ok, since 14677 // those elements were being written anyway (with undefs). 14678 // In the case of all undefs we're defaulting to using elems from 0 14679 // Note: StartMask cannot be negative, it's checked in 14680 // isReInterleaveMask 14681 Shuffle = Builder.CreateShuffleVector( 14682 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)); 14683 } 14684 14685 if (UseScalable) 14686 Shuffle = Builder.CreateInsertVector( 14687 STVTy, UndefValue::get(STVTy), Shuffle, 14688 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0)); 14689 14690 Ops.push_back(Shuffle); 14691 } 14692 14693 if (UseScalable) 14694 Ops.push_back(PTrue); 14695 14696 // If we generating more than one store, we compute the base address of 14697 // subsequent stores as an offset from the previous. 
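    // Illustrative example (values chosen here, not taken from the surrounding
    // code): with Factor = 3, LaneLen = 4 and i32 elements, each st3 call
    // writes LaneLen * Factor = 12 elements, so the GEP below advances
    // BaseAddr by 12 i32 slots between stores.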
14698 if (StoreCount > 0) 14699 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), 14700 BaseAddr, LaneLen * Factor); 14701 14702 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); 14703 Builder.CreateCall(StNFunc, Ops); 14704 } 14705 return true; 14706 } 14707 14708 EVT AArch64TargetLowering::getOptimalMemOpType( 14709 const MemOp &Op, const AttributeList &FuncAttributes) const { 14710 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); 14711 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 14712 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 14713 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 14714 // taken one instruction to materialize the v2i64 zero and one store (with 14715 // restrictive addressing mode). Just do i64 stores. 14716 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 14717 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 14718 if (Op.isAligned(AlignCheck)) 14719 return true; 14720 unsigned Fast; 14721 return allowsMisalignedMemoryAccesses(VT, 0, Align(1), 14722 MachineMemOperand::MONone, &Fast) && 14723 Fast; 14724 }; 14725 14726 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 14727 AlignmentIsAcceptable(MVT::v16i8, Align(16))) 14728 return MVT::v16i8; 14729 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 14730 return MVT::f128; 14731 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 14732 return MVT::i64; 14733 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 14734 return MVT::i32; 14735 return MVT::Other; 14736 } 14737 14738 LLT AArch64TargetLowering::getOptimalMemOpLLT( 14739 const MemOp &Op, const AttributeList &FuncAttributes) const { 14740 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); 14741 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 14742 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 14743 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 14744 // taken one instruction to materialize the v2i64 zero and one store (with 14745 // restrictive addressing mode). Just do i64 stores. 14746 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 14747 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 14748 if (Op.isAligned(AlignCheck)) 14749 return true; 14750 unsigned Fast; 14751 return allowsMisalignedMemoryAccesses(VT, 0, Align(1), 14752 MachineMemOperand::MONone, &Fast) && 14753 Fast; 14754 }; 14755 14756 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 14757 AlignmentIsAcceptable(MVT::v2i64, Align(16))) 14758 return LLT::fixed_vector(2, 64); 14759 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 14760 return LLT::scalar(128); 14761 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 14762 return LLT::scalar(64); 14763 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 14764 return LLT::scalar(32); 14765 return LLT(); 14766 } 14767 14768 // 12-bit optionally shifted immediates are legal for adds. 14769 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { 14770 if (Immed == std::numeric_limits<int64_t>::min()) { 14771 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed 14772 << ": avoid UB for INT64_MIN\n"); 14773 return false; 14774 } 14775 // Same encoding for add/sub, just flip the sign. 
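  // Illustrative examples (assuming the encoding check below): 0xfff and
  // 0x7ff000 are legal (a 12-bit value, optionally shifted left by 12 bits),
  // while 0x1000001 is not, since it fits neither form.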
14776 Immed = std::abs(Immed); 14777 bool IsLegal = ((Immed >> 12) == 0 || 14778 ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); 14779 LLVM_DEBUG(dbgs() << "Is " << Immed 14780 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); 14781 return IsLegal; 14782 } 14783 14784 // Return false to prevent folding 14785 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine, 14786 // if the folding leads to worse code. 14787 bool AArch64TargetLowering::isMulAddWithConstProfitable( 14788 SDValue AddNode, SDValue ConstNode) const { 14789 // Let the DAGCombiner decide for vector types and large types. 14790 const EVT VT = AddNode.getValueType(); 14791 if (VT.isVector() || VT.getScalarSizeInBits() > 64) 14792 return true; 14793 14794 // It is worse if c1 is legal add immediate, while c1*c2 is not 14795 // and has to be composed by at least two instructions. 14796 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1)); 14797 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode); 14798 const int64_t C1 = C1Node->getSExtValue(); 14799 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue(); 14800 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue())) 14801 return true; 14802 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 14803 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn); 14804 if (Insn.size() > 1) 14805 return false; 14806 14807 // Default to true and let the DAGCombiner decide. 14808 return true; 14809 } 14810 14811 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 14812 // immediates is the same as for an add or a sub. 14813 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 14814 return isLegalAddImmediate(Immed); 14815 } 14816 14817 /// isLegalAddressingMode - Return true if the addressing mode represented 14818 /// by AM is legal for this target, for a load/store of the specified type. 14819 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 14820 const AddrMode &AM, Type *Ty, 14821 unsigned AS, Instruction *I) const { 14822 // AArch64 has five basic addressing modes: 14823 // reg 14824 // reg + 9-bit signed offset 14825 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 14826 // reg1 + reg2 14827 // reg + SIZE_IN_BYTES * reg 14828 14829 // No global is ever allowed as a base. 14830 if (AM.BaseGV) 14831 return false; 14832 14833 // No reg+reg+imm addressing. 14834 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 14835 return false; 14836 14837 // FIXME: Update this method to support scalable addressing modes. 
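  // Illustrative examples of the forms listed above (assembly shown only for
  // illustration): "ldr x0, [x1, #-17]" uses the 9-bit signed offset,
  // "ldr x0, [x1, #256]" uses the scaled 12-bit unsigned offset for an 8-byte
  // access, and "ldr x0, [x1, x2, lsl #3]" uses reg + SIZE_IN_BYTES * reg.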
14838 if (isa<ScalableVectorType>(Ty)) { 14839 uint64_t VecElemNumBytes = 14840 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8; 14841 return AM.HasBaseReg && !AM.BaseOffs && 14842 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); 14843 } 14844 14845 // check reg + imm case: 14846 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 14847 uint64_t NumBytes = 0; 14848 if (Ty->isSized()) { 14849 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 14850 NumBytes = NumBits / 8; 14851 if (!isPowerOf2_64(NumBits)) 14852 NumBytes = 0; 14853 } 14854 14855 if (!AM.Scale) { 14856 int64_t Offset = AM.BaseOffs; 14857 14858 // 9-bit signed offset 14859 if (isInt<9>(Offset)) 14860 return true; 14861 14862 // 12-bit unsigned offset 14863 unsigned shift = Log2_64(NumBytes); 14864 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 14865 // Must be a multiple of NumBytes (NumBytes is a power of 2) 14866 (Offset >> shift) << shift == Offset) 14867 return true; 14868 return false; 14869 } 14870 14871 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 14872 14873 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); 14874 } 14875 14876 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { 14877 // Consider splitting large offset of struct or array. 14878 return true; 14879 } 14880 14881 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( 14882 const MachineFunction &MF, EVT VT) const { 14883 VT = VT.getScalarType(); 14884 14885 if (!VT.isSimple()) 14886 return false; 14887 14888 switch (VT.getSimpleVT().SimpleTy) { 14889 case MVT::f16: 14890 return Subtarget->hasFullFP16(); 14891 case MVT::f32: 14892 case MVT::f64: 14893 return true; 14894 default: 14895 break; 14896 } 14897 14898 return false; 14899 } 14900 14901 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, 14902 Type *Ty) const { 14903 switch (Ty->getScalarType()->getTypeID()) { 14904 case Type::FloatTyID: 14905 case Type::DoubleTyID: 14906 return true; 14907 default: 14908 return false; 14909 } 14910 } 14911 14912 bool AArch64TargetLowering::generateFMAsInMachineCombiner( 14913 EVT VT, CodeGenOpt::Level OptLevel) const { 14914 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() && 14915 !useSVEForFixedLengthVectorVT(VT); 14916 } 14917 14918 const MCPhysReg * 14919 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 14920 // LR is a callee-save register, but we must treat it as clobbered by any call 14921 // site. Hence we include LR in the scratch registers, which are in turn added 14922 // as implicit-defs for stackmaps and patchpoints. 14923 static const MCPhysReg ScratchRegs[] = { 14924 AArch64::X16, AArch64::X17, AArch64::LR, 0 14925 }; 14926 return ScratchRegs; 14927 } 14928 14929 bool 14930 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 14931 CombineLevel Level) const { 14932 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || 14933 N->getOpcode() == ISD::SRL) && 14934 "Expected shift op"); 14935 14936 SDValue ShiftLHS = N->getOperand(0); 14937 EVT VT = N->getValueType(0); 14938 14939 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not 14940 // combine it with shift 'N' to let it be lowered to UBFX except: 14941 // ((x >> C) & mask) << C. 
14942 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 14943 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) { 14944 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1); 14945 if (isMask_64(TruncMask)) { 14946 SDValue AndLHS = ShiftLHS.getOperand(0); 14947 if (AndLHS.getOpcode() == ISD::SRL) { 14948 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) { 14949 if (N->getOpcode() == ISD::SHL) 14950 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1))) 14951 return SRLC->getZExtValue() == SHLC->getZExtValue(); 14952 return false; 14953 } 14954 } 14955 } 14956 } 14957 return true; 14958 } 14959 14960 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift( 14961 const SDNode *N) const { 14962 assert(N->getOpcode() == ISD::XOR && 14963 (N->getOperand(0).getOpcode() == ISD::SHL || 14964 N->getOperand(0).getOpcode() == ISD::SRL) && 14965 "Expected XOR(SHIFT) pattern"); 14966 14967 // Only commute if the entire NOT mask is a hidden shifted mask. 14968 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14969 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); 14970 if (XorC && ShiftC) { 14971 unsigned MaskIdx, MaskLen; 14972 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { 14973 unsigned ShiftAmt = ShiftC->getZExtValue(); 14974 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 14975 if (N->getOperand(0).getOpcode() == ISD::SHL) 14976 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); 14977 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); 14978 } 14979 } 14980 14981 return false; 14982 } 14983 14984 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( 14985 const SDNode *N, CombineLevel Level) const { 14986 assert(((N->getOpcode() == ISD::SHL && 14987 N->getOperand(0).getOpcode() == ISD::SRL) || 14988 (N->getOpcode() == ISD::SRL && 14989 N->getOperand(0).getOpcode() == ISD::SHL)) && 14990 "Expected shift-shift mask"); 14991 // Don't allow multiuse shift folding with the same shift amount. 14992 if (!N->getOperand(0)->hasOneUse()) 14993 return false; 14994 14995 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns. 14996 EVT VT = N->getValueType(0); 14997 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) { 14998 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); 14999 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 15000 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); 15001 } 15002 15003 return true; 15004 } 15005 15006 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 15007 Type *Ty) const { 15008 assert(Ty->isIntegerTy()); 15009 15010 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 15011 if (BitSize == 0) 15012 return false; 15013 15014 int64_t Val = Imm.getSExtValue(); 15015 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 15016 return true; 15017 15018 if ((int64_t)Val < 0) 15019 Val = ~Val; 15020 if (BitSize == 32) 15021 Val &= (1LL << 32) - 1; 15022 15023 unsigned LZ = countLeadingZeros((uint64_t)Val); 15024 unsigned Shift = (63 - LZ) / 16; 15025 // MOVZ is free so return true for one or fewer MOVK. 
15026 return Shift < 3; 15027 } 15028 15029 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 15030 unsigned Index) const { 15031 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 15032 return false; 15033 15034 return (Index == 0 || Index == ResVT.getVectorMinNumElements()); 15035 } 15036 15037 /// Turn vector tests of the signbit in the form of: 15038 /// xor (sra X, elt_size(X)-1), -1 15039 /// into: 15040 /// cmge X, X, #0 15041 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 15042 const AArch64Subtarget *Subtarget) { 15043 EVT VT = N->getValueType(0); 15044 if (!Subtarget->hasNEON() || !VT.isVector()) 15045 return SDValue(); 15046 15047 // There must be a shift right algebraic before the xor, and the xor must be a 15048 // 'not' operation. 15049 SDValue Shift = N->getOperand(0); 15050 SDValue Ones = N->getOperand(1); 15051 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || 15052 !ISD::isBuildVectorAllOnes(Ones.getNode())) 15053 return SDValue(); 15054 15055 // The shift should be smearing the sign bit across each vector element. 15056 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 15057 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 15058 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 15059 return SDValue(); 15060 15061 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); 15062 } 15063 15064 // Given a vecreduce_add node, detect the below pattern and convert it to the 15065 // node sequence with UABDL, [S|U]ADB and UADDLP. 15066 // 15067 // i32 vecreduce_add( 15068 // v16i32 abs( 15069 // v16i32 sub( 15070 // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b)))) 15071 // =================> 15072 // i32 vecreduce_add( 15073 // v4i32 UADDLP( 15074 // v8i16 add( 15075 // v8i16 zext( 15076 // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b 15077 // v8i16 zext( 15078 // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b 15079 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, 15080 SelectionDAG &DAG) { 15081 // Assumed i32 vecreduce_add 15082 if (N->getValueType(0) != MVT::i32) 15083 return SDValue(); 15084 15085 SDValue VecReduceOp0 = N->getOperand(0); 15086 unsigned Opcode = VecReduceOp0.getOpcode(); 15087 // Assumed v16i32 abs 15088 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32) 15089 return SDValue(); 15090 15091 SDValue ABS = VecReduceOp0; 15092 // Assumed v16i32 sub 15093 if (ABS->getOperand(0)->getOpcode() != ISD::SUB || 15094 ABS->getOperand(0)->getValueType(0) != MVT::v16i32) 15095 return SDValue(); 15096 15097 SDValue SUB = ABS->getOperand(0); 15098 unsigned Opcode0 = SUB->getOperand(0).getOpcode(); 15099 unsigned Opcode1 = SUB->getOperand(1).getOpcode(); 15100 // Assumed v16i32 type 15101 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 || 15102 SUB->getOperand(1)->getValueType(0) != MVT::v16i32) 15103 return SDValue(); 15104 15105 // Assumed zext or sext 15106 bool IsZExt = false; 15107 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) { 15108 IsZExt = true; 15109 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) { 15110 IsZExt = false; 15111 } else 15112 return SDValue(); 15113 15114 SDValue EXT0 = SUB->getOperand(0); 15115 SDValue EXT1 = SUB->getOperand(1); 15116 // Assumed zext's operand has v16i8 type 15117 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 || 15118 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8) 15119 return 
SDValue(); 15120 15121 // Pattern is dectected. Let's convert it to sequence of nodes. 15122 SDLoc DL(N); 15123 15124 // First, create the node pattern of UABD/SABD. 15125 SDValue UABDHigh8Op0 = 15126 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), 15127 DAG.getConstant(8, DL, MVT::i64)); 15128 SDValue UABDHigh8Op1 = 15129 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), 15130 DAG.getConstant(8, DL, MVT::i64)); 15131 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, 15132 UABDHigh8Op0, UABDHigh8Op1); 15133 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8); 15134 15135 // Second, create the node pattern of UABAL. 15136 SDValue UABDLo8Op0 = 15137 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), 15138 DAG.getConstant(0, DL, MVT::i64)); 15139 SDValue UABDLo8Op1 = 15140 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), 15141 DAG.getConstant(0, DL, MVT::i64)); 15142 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, 15143 UABDLo8Op0, UABDLo8Op1); 15144 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8); 15145 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD); 15146 15147 // Third, create the node of UADDLP. 15148 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL); 15149 15150 // Fourth, create the node of VECREDUCE_ADD. 15151 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP); 15152 } 15153 15154 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce 15155 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) 15156 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) 15157 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, 15158 const AArch64Subtarget *ST) { 15159 if (!ST->hasDotProd()) 15160 return performVecReduceAddCombineWithUADDLP(N, DAG); 15161 15162 SDValue Op0 = N->getOperand(0); 15163 if (N->getValueType(0) != MVT::i32 || 15164 Op0.getValueType().getVectorElementType() != MVT::i32) 15165 return SDValue(); 15166 15167 unsigned ExtOpcode = Op0.getOpcode(); 15168 SDValue A = Op0; 15169 SDValue B; 15170 if (ExtOpcode == ISD::MUL) { 15171 A = Op0.getOperand(0); 15172 B = Op0.getOperand(1); 15173 if (A.getOpcode() != B.getOpcode() || 15174 A.getOperand(0).getValueType() != B.getOperand(0).getValueType()) 15175 return SDValue(); 15176 ExtOpcode = A.getOpcode(); 15177 } 15178 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) 15179 return SDValue(); 15180 15181 EVT Op0VT = A.getOperand(0).getValueType(); 15182 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8) 15183 return SDValue(); 15184 15185 SDLoc DL(Op0); 15186 // For non-mla reductions B can be set to 1. For MLA we take the operand of 15187 // the extend B. 15188 if (!B) 15189 B = DAG.getConstant(1, DL, Op0VT); 15190 else 15191 B = B.getOperand(0); 15192 15193 SDValue Zeros = 15194 DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32); 15195 auto DotOpcode = 15196 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT; 15197 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, 15198 A.getOperand(0), B); 15199 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); 15200 } 15201 15202 // Given an (integer) vecreduce, we know the order of the inputs does not 15203 // matter. 
// We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  auto DetectAddExtract = [&](SDValue A) {
    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
    // UADDLP(x) if found.
    if (A.getOpcode() != ISD::ADD)
      return SDValue();
    EVT VT = A.getValueType();
    SDValue Op0 = A.getOperand(0);
    SDValue Op1 = A.getOperand(1);
    // Both operands must use the same kind of extend.
    if (Op0.getOpcode() != Op1.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
    SDValue Ext0 = Op0.getOperand(0);
    SDValue Ext1 = Op1.getOperand(0);
    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext0.getOperand(0) != Ext1.getOperand(0))
      return SDValue();
    // Check that the source type is twice the add type, and that the extracts
    // are from the upper/lower halves of the same source.
    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
        VT.getVectorNumElements() * 2)
      return SDValue();
    if ((Ext0.getConstantOperandVal(1) != 0 &&
         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
        (Ext1.getConstantOperandVal(1) != 0 &&
         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
      return SDValue();
    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
                                                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
  };

  SDValue A = N->getOperand(0);
  if (SDValue R = DetectAddExtract(A))
    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
  if (A.getOpcode() == ISD::ADD) {
    if (SDValue R = DetectAddExtract(A.getOperand(0)))
      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                                     A.getOperand(1)));
    if (SDValue R = DetectAddExtract(A.getOperand(1)))
      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                                     A.getOperand(0)));
  }
  return SDValue();
}

static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
}

SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     SmallVectorImpl<SDNode *> &Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(N->getValueType(0), Attr))
    return SDValue(N, 0); // Lower SDIV as SDIV

  EVT VT = N->getValueType(0);

  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger than legal types.
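  // Illustrative example of the fixed-width expansion below: sdiv x, 8
  // becomes csel(add(x, 7), x, lt(x, 0)) followed by sra #3, and a negative
  // divisor additionally negates the result.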
15278 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) 15279 return SDValue(N, 0); 15280 15281 // fold (sdiv X, pow2) 15282 if ((VT != MVT::i32 && VT != MVT::i64) || 15283 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) 15284 return SDValue(); 15285 15286 SDLoc DL(N); 15287 SDValue N0 = N->getOperand(0); 15288 unsigned Lg2 = Divisor.countTrailingZeros(); 15289 SDValue Zero = DAG.getConstant(0, DL, VT); 15290 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 15291 15292 // Add (N0 < 0) ? Pow2 - 1 : 0; 15293 SDValue CCVal; 15294 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); 15295 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); 15296 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); 15297 15298 Created.push_back(Cmp.getNode()); 15299 Created.push_back(Add.getNode()); 15300 Created.push_back(CSel.getNode()); 15301 15302 // Divide by pow2. 15303 SDValue SRA = 15304 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); 15305 15306 // If we're dividing by a positive value, we're done. Otherwise, we must 15307 // negate the result. 15308 if (Divisor.isNonNegative()) 15309 return SRA; 15310 15311 Created.push_back(SRA.getNode()); 15312 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); 15313 } 15314 15315 SDValue 15316 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, 15317 SelectionDAG &DAG, 15318 SmallVectorImpl<SDNode *> &Created) const { 15319 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 15320 if (isIntDivCheap(N->getValueType(0), Attr)) 15321 return SDValue(N, 0); // Lower SREM as SREM 15322 15323 EVT VT = N->getValueType(0); 15324 15325 // For scalable and fixed types, mark them as cheap so we can handle it much 15326 // later. This allows us to handle larger than legal types. 
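  // Illustrative example of the expansion below: srem x, 8 computes
  // and(x, 7) and and(sub(0, x), 7), then uses CSNEG to select (and negate)
  // the appropriate value based on the sign of x.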
15327 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) 15328 return SDValue(N, 0); 15329 15330 // fold (srem X, pow2) 15331 if ((VT != MVT::i32 && VT != MVT::i64) || 15332 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) 15333 return SDValue(); 15334 15335 unsigned Lg2 = Divisor.countTrailingZeros(); 15336 if (Lg2 == 0) 15337 return SDValue(); 15338 15339 SDLoc DL(N); 15340 SDValue N0 = N->getOperand(0); 15341 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 15342 SDValue Zero = DAG.getConstant(0, DL, VT); 15343 SDValue CCVal, CSNeg; 15344 if (Lg2 == 1) { 15345 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL); 15346 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); 15347 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); 15348 15349 Created.push_back(Cmp.getNode()); 15350 Created.push_back(And.getNode()); 15351 } else { 15352 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); 15353 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 15354 15355 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); 15356 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); 15357 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); 15358 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, 15359 Negs.getValue(1)); 15360 15361 Created.push_back(Negs.getNode()); 15362 Created.push_back(AndPos.getNode()); 15363 Created.push_back(AndNeg.getNode()); 15364 } 15365 15366 return CSNeg; 15367 } 15368 15369 static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) { 15370 switch(getIntrinsicID(S.getNode())) { 15371 default: 15372 break; 15373 case Intrinsic::aarch64_sve_cntb: 15374 return 8; 15375 case Intrinsic::aarch64_sve_cnth: 15376 return 16; 15377 case Intrinsic::aarch64_sve_cntw: 15378 return 32; 15379 case Intrinsic::aarch64_sve_cntd: 15380 return 64; 15381 } 15382 return {}; 15383 } 15384 15385 /// Calculates what the pre-extend type is, based on the extension 15386 /// operation node provided by \p Extend. 15387 /// 15388 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the 15389 /// pre-extend type is pulled directly from the operand, while other extend 15390 /// operations need a bit more inspection to get this information. 
15391 /// 15392 /// \param Extend The SDNode from the DAG that represents the extend operation 15393 /// 15394 /// \returns The type representing the \p Extend source type, or \p MVT::Other 15395 /// if no valid type can be determined 15396 static EVT calculatePreExtendType(SDValue Extend) { 15397 switch (Extend.getOpcode()) { 15398 case ISD::SIGN_EXTEND: 15399 case ISD::ZERO_EXTEND: 15400 return Extend.getOperand(0).getValueType(); 15401 case ISD::AssertSext: 15402 case ISD::AssertZext: 15403 case ISD::SIGN_EXTEND_INREG: { 15404 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1)); 15405 if (!TypeNode) 15406 return MVT::Other; 15407 return TypeNode->getVT(); 15408 } 15409 case ISD::AND: { 15410 ConstantSDNode *Constant = 15411 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode()); 15412 if (!Constant) 15413 return MVT::Other; 15414 15415 uint32_t Mask = Constant->getZExtValue(); 15416 15417 if (Mask == UCHAR_MAX) 15418 return MVT::i8; 15419 else if (Mask == USHRT_MAX) 15420 return MVT::i16; 15421 else if (Mask == UINT_MAX) 15422 return MVT::i32; 15423 15424 return MVT::Other; 15425 } 15426 default: 15427 return MVT::Other; 15428 } 15429 } 15430 15431 /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern 15432 /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector 15433 /// SExt/ZExt rather than the scalar SExt/ZExt 15434 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) { 15435 EVT VT = BV.getValueType(); 15436 if (BV.getOpcode() != ISD::BUILD_VECTOR && 15437 BV.getOpcode() != ISD::VECTOR_SHUFFLE) 15438 return SDValue(); 15439 15440 // Use the first item in the buildvector/shuffle to get the size of the 15441 // extend, and make sure it looks valid. 15442 SDValue Extend = BV->getOperand(0); 15443 unsigned ExtendOpcode = Extend.getOpcode(); 15444 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || 15445 ExtendOpcode == ISD::SIGN_EXTEND_INREG || 15446 ExtendOpcode == ISD::AssertSext; 15447 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && 15448 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) 15449 return SDValue(); 15450 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure 15451 // calculatePreExtendType will work without issue. 15452 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE && 15453 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND) 15454 return SDValue(); 15455 15456 // Restrict valid pre-extend data type 15457 EVT PreExtendType = calculatePreExtendType(Extend); 15458 if (PreExtendType == MVT::Other || 15459 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2) 15460 return SDValue(); 15461 15462 // Make sure all other operands are equally extended 15463 for (SDValue Op : drop_begin(BV->ops())) { 15464 if (Op.isUndef()) 15465 continue; 15466 unsigned Opc = Op.getOpcode(); 15467 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || 15468 Opc == ISD::AssertSext; 15469 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) 15470 return SDValue(); 15471 } 15472 15473 SDValue NBV; 15474 SDLoc DL(BV); 15475 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 15476 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); 15477 EVT PreExtendLegalType = 15478 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; 15479 SmallVector<SDValue, 8> NewOps; 15480 for (SDValue Op : BV->ops()) 15481 NewOps.push_back(Op.isUndef() ? 
DAG.getUNDEF(PreExtendLegalType) 15482 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, 15483 PreExtendLegalType)); 15484 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); 15485 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE 15486 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType()); 15487 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0), 15488 BV.getOperand(1).isUndef() 15489 ? DAG.getUNDEF(PreExtendVT) 15490 : BV.getOperand(1).getOperand(0), 15491 cast<ShuffleVectorSDNode>(BV)->getMask()); 15492 } 15493 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); 15494 } 15495 15496 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) 15497 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt 15498 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { 15499 // If the value type isn't a vector, none of the operands are going to be dups 15500 EVT VT = Mul->getValueType(0); 15501 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) 15502 return SDValue(); 15503 15504 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG); 15505 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG); 15506 15507 // Neither operands have been changed, don't make any further changes 15508 if (!Op0 && !Op1) 15509 return SDValue(); 15510 15511 SDLoc DL(Mul); 15512 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0), 15513 Op1 ? Op1 : Mul->getOperand(1)); 15514 } 15515 15516 // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz 15517 // Same for other types with equivalent constants. 15518 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { 15519 EVT VT = N->getValueType(0); 15520 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && 15521 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) 15522 return SDValue(); 15523 if (N->getOperand(0).getOpcode() != ISD::AND || 15524 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) 15525 return SDValue(); 15526 15527 SDValue And = N->getOperand(0); 15528 SDValue Srl = And.getOperand(0); 15529 15530 APInt V1, V2, V3; 15531 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || 15532 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || 15533 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) 15534 return SDValue(); 15535 15536 unsigned HalfSize = VT.getScalarSizeInBits() / 2; 15537 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || 15538 V3 != (HalfSize - 1)) 15539 return SDValue(); 15540 15541 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), 15542 EVT::getIntegerVT(*DAG.getContext(), HalfSize), 15543 VT.getVectorElementCount() * 2); 15544 15545 SDLoc DL(N); 15546 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); 15547 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); 15548 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); 15549 } 15550 15551 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 15552 TargetLowering::DAGCombinerInfo &DCI, 15553 const AArch64Subtarget *Subtarget) { 15554 15555 if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) 15556 return Ext; 15557 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) 15558 return Ext; 15559 15560 if (DCI.isBeforeLegalizeOps()) 15561 return SDValue(); 15562 15563 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y, 15564 // and in MachineCombiner 
pass, add+mul will be combined into madd. 15565 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X. 15566 SDLoc DL(N); 15567 EVT VT = N->getValueType(0); 15568 SDValue N0 = N->getOperand(0); 15569 SDValue N1 = N->getOperand(1); 15570 SDValue MulOper; 15571 unsigned AddSubOpc; 15572 15573 auto IsAddSubWith1 = [&](SDValue V) -> bool { 15574 AddSubOpc = V->getOpcode(); 15575 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) { 15576 SDValue Opnd = V->getOperand(1); 15577 MulOper = V->getOperand(0); 15578 if (AddSubOpc == ISD::SUB) 15579 std::swap(Opnd, MulOper); 15580 if (auto C = dyn_cast<ConstantSDNode>(Opnd)) 15581 return C->isOne(); 15582 } 15583 return false; 15584 }; 15585 15586 if (IsAddSubWith1(N0)) { 15587 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper); 15588 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal); 15589 } 15590 15591 if (IsAddSubWith1(N1)) { 15592 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper); 15593 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal); 15594 } 15595 15596 // The below optimizations require a constant RHS. 15597 if (!isa<ConstantSDNode>(N1)) 15598 return SDValue(); 15599 15600 ConstantSDNode *C = cast<ConstantSDNode>(N1); 15601 const APInt &ConstValue = C->getAPIntValue(); 15602 15603 // Allow the scaling to be folded into the `cnt` instruction by preventing 15604 // the scaling to be obscured here. This makes it easier to pattern match. 15605 if (IsSVECntIntrinsic(N0) || 15606 (N0->getOpcode() == ISD::TRUNCATE && 15607 (IsSVECntIntrinsic(N0->getOperand(0))))) 15608 if (ConstValue.sge(1) && ConstValue.sle(16)) 15609 return SDValue(); 15610 15611 // Multiplication of a power of two plus/minus one can be done more 15612 // cheaply as as shift+add/sub. For now, this is true unilaterally. If 15613 // future CPUs have a cheaper MADD instruction, this may need to be 15614 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and 15615 // 64-bit is 5 cycles, so this is always a win. 15616 // More aggressively, some multiplications N0 * C can be lowered to 15617 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, 15618 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8) 15619 // TODO: lower more cases. 15620 15621 // TrailingZeroes is used to test if the mul can be lowered to 15622 // shift+add+shift. 15623 unsigned TrailingZeroes = ConstValue.countTrailingZeros(); 15624 if (TrailingZeroes) { 15625 // Conservatively do not lower to shift+add+shift if the mul might be 15626 // folded into smul or umul. 15627 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || 15628 isZeroExtended(N0.getNode(), DAG))) 15629 return SDValue(); 15630 // Conservatively do not lower to shift+add+shift if the mul might be 15631 // folded into madd or msub. 15632 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || 15633 N->use_begin()->getOpcode() == ISD::SUB)) 15634 return SDValue(); 15635 } 15636 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub 15637 // and shift+add+shift. 
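  // Worked examples (illustrative): x * 6 with C = (2 + 1) * 2 becomes
  // shl(add(shl(x, 1), x), 1); x * 7 with C = 8 - 1 becomes sub(shl(x, 3), x);
  // and with LSLFast, x * 45 with C = (1 + 4) * (1 + 8) becomes
  // MV = add(shl(x, 2), x); add(shl(MV, 3), MV).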
15638 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); 15639 unsigned ShiftAmt; 15640 15641 auto Shl = [&](SDValue N0, unsigned N1) { 15642 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64); 15643 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS); 15644 }; 15645 auto Add = [&](SDValue N0, SDValue N1) { 15646 return DAG.getNode(ISD::ADD, DL, VT, N0, N1); 15647 }; 15648 auto Sub = [&](SDValue N0, SDValue N1) { 15649 return DAG.getNode(ISD::SUB, DL, VT, N0, N1); 15650 }; 15651 auto Negate = [&](SDValue N) { 15652 SDValue Zero = DAG.getConstant(0, DL, VT); 15653 return DAG.getNode(ISD::SUB, DL, VT, Zero, N); 15654 }; 15655 15656 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg: 15657 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as 15658 // the (2^N - 1) can't be execused via a single instruction. 15659 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) { 15660 unsigned BitWidth = C.getBitWidth(); 15661 for (unsigned i = 1; i < BitWidth / 2; i++) { 15662 APInt Rem; 15663 APInt X(BitWidth, (1 << i) + 1); 15664 APInt::sdivrem(C, X, N, Rem); 15665 APInt NVMinus1 = N - 1; 15666 if (Rem == 0 && NVMinus1.isPowerOf2()) { 15667 M = X; 15668 return true; 15669 } 15670 } 15671 return false; 15672 }; 15673 15674 if (ConstValue.isNonNegative()) { 15675 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) 15676 // (mul x, 2^N - 1) => (sub (shl x, N), x) 15677 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M)) 15678 // (mul x, (2^M + 1) * (2^N + 1)) 15679 // => MV = (add (shl x, M), x); (add (shl MV, N), MV) 15680 APInt SCVMinus1 = ShiftedConstValue - 1; 15681 APInt SCVPlus1 = ShiftedConstValue + 1; 15682 APInt CVPlus1 = ConstValue + 1; 15683 APInt CVM, CVN; 15684 if (SCVMinus1.isPowerOf2()) { 15685 ShiftAmt = SCVMinus1.logBase2(); 15686 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes); 15687 } else if (CVPlus1.isPowerOf2()) { 15688 ShiftAmt = CVPlus1.logBase2(); 15689 return Sub(Shl(N0, ShiftAmt), N0); 15690 } else if (SCVPlus1.isPowerOf2()) { 15691 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; 15692 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); 15693 } else if (Subtarget->hasLSLFast() && 15694 isPowPlusPlusConst(ConstValue, CVM, CVN)) { 15695 APInt CVMMinus1 = CVM - 1; 15696 APInt CVNMinus1 = CVN - 1; 15697 unsigned ShiftM1 = CVMMinus1.logBase2(); 15698 unsigned ShiftN1 = CVNMinus1.logBase2(); 15699 // LSLFast implicate that Shifts <= 3 places are fast 15700 if (ShiftM1 <= 3 && ShiftN1 <= 3) { 15701 SDValue MVal = Add(Shl(N0, ShiftM1), N0); 15702 return Add(Shl(MVal, ShiftN1), MVal); 15703 } 15704 } 15705 } else { 15706 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 15707 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 15708 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N)) 15709 APInt SCVPlus1 = -ShiftedConstValue + 1; 15710 APInt CVNegPlus1 = -ConstValue + 1; 15711 APInt CVNegMinus1 = -ConstValue - 1; 15712 if (CVNegPlus1.isPowerOf2()) { 15713 ShiftAmt = CVNegPlus1.logBase2(); 15714 return Sub(N0, Shl(N0, ShiftAmt)); 15715 } else if (CVNegMinus1.isPowerOf2()) { 15716 ShiftAmt = CVNegMinus1.logBase2(); 15717 return Negate(Add(Shl(N0, ShiftAmt), N0)); 15718 } else if (SCVPlus1.isPowerOf2()) { 15719 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; 15720 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt)); 15721 } 15722 } 15723 15724 return SDValue(); 15725 } 15726 15727 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 15728 SelectionDAG &DAG) { 15729 // Take advantage of vector 
comparisons producing 0 or -1 in each lane to 15730 // optimize away operation when it's from a constant. 15731 // 15732 // The general transformation is: 15733 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 15734 // AND(VECTOR_CMP(x,y), constant2) 15735 // constant2 = UNARYOP(constant) 15736 15737 // Early exit if this isn't a vector operation, the operand of the 15738 // unary operation isn't a bitwise AND, or if the sizes of the operations 15739 // aren't the same. 15740 EVT VT = N->getValueType(0); 15741 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 15742 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 15743 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 15744 return SDValue(); 15745 15746 // Now check that the other operand of the AND is a constant. We could 15747 // make the transformation for non-constant splats as well, but it's unclear 15748 // that would be a benefit as it would not eliminate any operations, just 15749 // perform one more step in scalar code before moving to the vector unit. 15750 if (BuildVectorSDNode *BV = 15751 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 15752 // Bail out if the vector isn't a constant. 15753 if (!BV->isConstant()) 15754 return SDValue(); 15755 15756 // Everything checks out. Build up the new and improved node. 15757 SDLoc DL(N); 15758 EVT IntVT = BV->getValueType(0); 15759 // Create a new constant of the appropriate type for the transformed 15760 // DAG. 15761 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 15762 // The AND node needs bitcasts to/from an integer vector type around it. 15763 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 15764 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 15765 N->getOperand(0)->getOperand(0), MaskConst); 15766 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 15767 return Res; 15768 } 15769 15770 return SDValue(); 15771 } 15772 15773 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, 15774 const AArch64Subtarget *Subtarget) { 15775 // First try to optimize away the conversion when it's conditionally from 15776 // a constant. Vectors only. 15777 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) 15778 return Res; 15779 15780 EVT VT = N->getValueType(0); 15781 if (VT != MVT::f32 && VT != MVT::f64) 15782 return SDValue(); 15783 15784 // Only optimize when the source and destination types have the same width. 15785 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) 15786 return SDValue(); 15787 15788 // If the result of an integer load is only used by an integer-to-float 15789 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. 15790 // This eliminates an "integer-to-vector-move" UOP and improves throughput. 15791 SDValue N0 = N->getOperand(0); 15792 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 15793 // Do not change the width of a volatile load. 15794 !cast<LoadSDNode>(N0)->isVolatile()) { 15795 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 15796 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 15797 LN0->getPointerInfo(), LN0->getAlign(), 15798 LN0->getMemOperand()->getFlags()); 15799 15800 // Make sure successors of the original load stay after it by updating them 15801 // to use the new Chain. 15802 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 15803 15804 unsigned Opcode = 15805 (N->getOpcode() == ISD::SINT_TO_FP) ? 
AArch64ISD::SITOF : AArch64ISD::UITOF; 15806 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 15807 } 15808 15809 return SDValue(); 15810 } 15811 15812 /// Fold a floating-point multiply by power of two into floating-point to 15813 /// fixed-point conversion. 15814 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, 15815 TargetLowering::DAGCombinerInfo &DCI, 15816 const AArch64Subtarget *Subtarget) { 15817 if (!Subtarget->hasNEON() || Subtarget->forceStreamingCompatibleSVE()) 15818 return SDValue(); 15819 15820 if (!N->getValueType(0).isSimple()) 15821 return SDValue(); 15822 15823 SDValue Op = N->getOperand(0); 15824 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) 15825 return SDValue(); 15826 15827 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) 15828 return SDValue(); 15829 15830 SDValue ConstVec = Op->getOperand(1); 15831 if (!isa<BuildVectorSDNode>(ConstVec)) 15832 return SDValue(); 15833 15834 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 15835 uint32_t FloatBits = FloatTy.getSizeInBits(); 15836 if (FloatBits != 32 && FloatBits != 64 && 15837 (FloatBits != 16 || !Subtarget->hasFullFP16())) 15838 return SDValue(); 15839 15840 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 15841 uint32_t IntBits = IntTy.getSizeInBits(); 15842 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 15843 return SDValue(); 15844 15845 // Avoid conversions where iN is larger than the float (e.g., float -> i64). 15846 if (IntBits > FloatBits) 15847 return SDValue(); 15848 15849 BitVector UndefElements; 15850 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15851 int32_t Bits = IntBits == 64 ? 64 : 32; 15852 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 15853 if (C == -1 || C == 0 || C > Bits) 15854 return SDValue(); 15855 15856 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger(); 15857 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy)) 15858 return SDValue(); 15859 15860 if (N->getOpcode() == ISD::FP_TO_SINT_SAT || 15861 N->getOpcode() == ISD::FP_TO_UINT_SAT) { 15862 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT(); 15863 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) 15864 return SDValue(); 15865 } 15866 15867 SDLoc DL(N); 15868 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT || 15869 N->getOpcode() == ISD::FP_TO_SINT_SAT); 15870 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 15871 : Intrinsic::aarch64_neon_vcvtfp2fxu; 15872 SDValue FixConv = 15873 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 15874 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 15875 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 15876 // We can handle smaller integers by generating an extra trunc. 15877 if (IntBits < FloatBits) 15878 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 15879 15880 return FixConv; 15881 } 15882 15883 /// Fold a floating-point divide by power of two into fixed-point to 15884 /// floating-point conversion. 
15885 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 15886 TargetLowering::DAGCombinerInfo &DCI, 15887 const AArch64Subtarget *Subtarget) { 15888 if (!Subtarget->hasNEON()) 15889 return SDValue(); 15890 15891 SDValue Op = N->getOperand(0); 15892 unsigned Opc = Op->getOpcode(); 15893 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 15894 !Op.getOperand(0).getValueType().isSimple() || 15895 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 15896 return SDValue(); 15897 15898 SDValue ConstVec = N->getOperand(1); 15899 if (!isa<BuildVectorSDNode>(ConstVec)) 15900 return SDValue(); 15901 15902 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 15903 int32_t IntBits = IntTy.getSizeInBits(); 15904 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 15905 return SDValue(); 15906 15907 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 15908 int32_t FloatBits = FloatTy.getSizeInBits(); 15909 if (FloatBits != 32 && FloatBits != 64) 15910 return SDValue(); 15911 15912 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 15913 if (IntBits > FloatBits) 15914 return SDValue(); 15915 15916 BitVector UndefElements; 15917 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15918 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 15919 if (C == -1 || C == 0 || C > FloatBits) 15920 return SDValue(); 15921 15922 MVT ResTy; 15923 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 15924 switch (NumLanes) { 15925 default: 15926 return SDValue(); 15927 case 2: 15928 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 15929 break; 15930 case 4: 15931 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 15932 break; 15933 } 15934 15935 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 15936 return SDValue(); 15937 15938 SDLoc DL(N); 15939 SDValue ConvInput = Op.getOperand(0); 15940 bool IsSigned = Opc == ISD::SINT_TO_FP; 15941 if (IntBits < FloatBits) 15942 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 15943 ResTy, ConvInput); 15944 15945 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp 15946 : Intrinsic::aarch64_neon_vcvtfxu2fp; 15947 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 15948 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 15949 DAG.getConstant(C, DL, MVT::i32)); 15950 } 15951 15952 /// An EXTR instruction is made up of two shifts, ORed together. This helper 15953 /// searches for and classifies those shifts. 15954 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 15955 bool &FromHi) { 15956 if (N.getOpcode() == ISD::SHL) 15957 FromHi = false; 15958 else if (N.getOpcode() == ISD::SRL) 15959 FromHi = true; 15960 else 15961 return false; 15962 15963 if (!isa<ConstantSDNode>(N.getOperand(1))) 15964 return false; 15965 15966 ShiftAmount = N->getConstantOperandVal(1); 15967 Src = N->getOperand(0); 15968 return true; 15969 } 15970 15971 /// EXTR instruction extracts a contiguous chunk of bits from two existing 15972 /// registers viewed as a high/low pair. This function looks for the pattern: 15973 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it 15974 /// with an EXTR. Can't quite be done in TableGen because the two immediates 15975 /// aren't independent. 
15976 static SDValue tryCombineToEXTR(SDNode *N, 15977 TargetLowering::DAGCombinerInfo &DCI) { 15978 SelectionDAG &DAG = DCI.DAG; 15979 SDLoc DL(N); 15980 EVT VT = N->getValueType(0); 15981 15982 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 15983 15984 if (VT != MVT::i32 && VT != MVT::i64) 15985 return SDValue(); 15986 15987 SDValue LHS; 15988 uint32_t ShiftLHS = 0; 15989 bool LHSFromHi = false; 15990 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 15991 return SDValue(); 15992 15993 SDValue RHS; 15994 uint32_t ShiftRHS = 0; 15995 bool RHSFromHi = false; 15996 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 15997 return SDValue(); 15998 15999 // If they're both trying to come from the high part of the register, they're 16000 // not really an EXTR. 16001 if (LHSFromHi == RHSFromHi) 16002 return SDValue(); 16003 16004 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 16005 return SDValue(); 16006 16007 if (LHSFromHi) { 16008 std::swap(LHS, RHS); 16009 std::swap(ShiftLHS, ShiftRHS); 16010 } 16011 16012 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 16013 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 16014 } 16015 16016 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 16017 const AArch64TargetLowering &TLI) { 16018 EVT VT = N->getValueType(0); 16019 SelectionDAG &DAG = DCI.DAG; 16020 SDLoc DL(N); 16021 16022 if (!VT.isVector()) 16023 return SDValue(); 16024 16025 // The combining code currently only works for NEON vectors. In particular, 16026 // it does not work for SVE when dealing with vectors wider than 128 bits. 16027 // It also doesn't work for streaming mode because it causes generating 16028 // bsl instructions that are invalid in streaming mode. 16029 if (TLI.useSVEForFixedLengthVectorVT( 16030 VT, 16031 DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())) 16032 return SDValue(); 16033 16034 SDValue N0 = N->getOperand(0); 16035 if (N0.getOpcode() != ISD::AND) 16036 return SDValue(); 16037 16038 SDValue N1 = N->getOperand(1); 16039 if (N1.getOpcode() != ISD::AND) 16040 return SDValue(); 16041 16042 // InstCombine does (not (neg a)) => (add a -1). 16043 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) 16044 // Loop over all combinations of AND operands. 16045 for (int i = 1; i >= 0; --i) { 16046 for (int j = 1; j >= 0; --j) { 16047 SDValue O0 = N0->getOperand(i); 16048 SDValue O1 = N1->getOperand(j); 16049 SDValue Sub, Add, SubSibling, AddSibling; 16050 16051 // Find a SUB and an ADD operand, one from each AND. 16052 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { 16053 Sub = O0; 16054 Add = O1; 16055 SubSibling = N0->getOperand(1 - i); 16056 AddSibling = N1->getOperand(1 - j); 16057 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { 16058 Add = O0; 16059 Sub = O1; 16060 AddSibling = N0->getOperand(1 - i); 16061 SubSibling = N1->getOperand(1 - j); 16062 } else 16063 continue; 16064 16065 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode())) 16066 continue; 16067 16068 // Constant ones is always righthand operand of the Add. 
16069 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode())) 16070 continue; 16071 16072 if (Sub.getOperand(1) != Add.getOperand(0)) 16073 continue; 16074 16075 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); 16076 } 16077 } 16078 16079 // (or (and a b) (and (not a) c)) => (bsl a b c) 16080 // We only have to look for constant vectors here since the general, variable 16081 // case can be handled in TableGen. 16082 unsigned Bits = VT.getScalarSizeInBits(); 16083 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); 16084 for (int i = 1; i >= 0; --i) 16085 for (int j = 1; j >= 0; --j) { 16086 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 16087 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 16088 if (!BVN0 || !BVN1) 16089 continue; 16090 16091 bool FoundMatch = true; 16092 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 16093 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 16094 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 16095 if (!CN0 || !CN1 || 16096 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 16097 FoundMatch = false; 16098 break; 16099 } 16100 } 16101 16102 if (FoundMatch) 16103 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), 16104 N0->getOperand(1 - i), N1->getOperand(1 - j)); 16105 } 16106 16107 return SDValue(); 16108 } 16109 16110 // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to 16111 // convert to csel(ccmp(.., cc0)), depending on cc1: 16112 16113 // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) 16114 // => 16115 // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0)) 16116 // 16117 // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) 16118 // => 16119 // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0)) 16120 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { 16121 EVT VT = N->getValueType(0); 16122 SDValue CSel0 = N->getOperand(0); 16123 SDValue CSel1 = N->getOperand(1); 16124 16125 if (CSel0.getOpcode() != AArch64ISD::CSEL || 16126 CSel1.getOpcode() != AArch64ISD::CSEL) 16127 return SDValue(); 16128 16129 if (!CSel0->hasOneUse() || !CSel1->hasOneUse()) 16130 return SDValue(); 16131 16132 if (!isNullConstant(CSel0.getOperand(0)) || 16133 !isOneConstant(CSel0.getOperand(1)) || 16134 !isNullConstant(CSel1.getOperand(0)) || 16135 !isOneConstant(CSel1.getOperand(1))) 16136 return SDValue(); 16137 16138 SDValue Cmp0 = CSel0.getOperand(3); 16139 SDValue Cmp1 = CSel1.getOperand(3); 16140 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2); 16141 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2); 16142 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) 16143 return SDValue(); 16144 if (Cmp1.getOpcode() != AArch64ISD::SUBS && 16145 Cmp0.getOpcode() == AArch64ISD::SUBS) { 16146 std::swap(Cmp0, Cmp1); 16147 std::swap(CC0, CC1); 16148 } 16149 16150 if (Cmp1.getOpcode() != AArch64ISD::SUBS) 16151 return SDValue(); 16152 16153 SDLoc DL(N); 16154 SDValue CCmp, Condition; 16155 unsigned NZCV; 16156 16157 if (N->getOpcode() == ISD::AND) { 16158 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); 16159 Condition = DAG.getConstant(InvCC0, DL, MVT_CC); 16160 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); 16161 } else { 16162 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); 16163 Condition = DAG.getConstant(CC0, DL, MVT_CC); 16164 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); 16165 } 16166 16167 SDValue NZCVOp = 
      DAG.getConstant(NZCV, DL, MVT::i32);
16168
16169   auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
16170   if (Op1 && Op1->getAPIntValue().isNegative() &&
16171       Op1->getAPIntValue().sgt(-32)) {
16172     // CCMP accepts an immediate in the range [0, 31]. If Op1 is a constant
16173     // in the range [-31, -1], we can instead select CCMN with the absolute
16174     // value of the constant and avoid the extra mov.
16175     SDValue AbsOp1 =
16176         DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
16177     CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
16178                        NZCVOp, Condition, Cmp0);
16179   } else {
16180     CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
16181                        Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
16182   }
16183   return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
16184                      CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
16185                      CCmp);
16186 }
16187
16188 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16189                                 const AArch64Subtarget *Subtarget,
16190                                 const AArch64TargetLowering &TLI) {
16191   SelectionDAG &DAG = DCI.DAG;
16192   EVT VT = N->getValueType(0);
16193
16194   if (SDValue R = performANDORCSELCombine(N, DAG))
16195     return R;
16196
16197   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
16198     return SDValue();
16199
16200   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
16201   if (SDValue Res = tryCombineToEXTR(N, DCI))
16202     return Res;
16203
16204   if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
16205     return Res;
16206
16207   return SDValue();
16208 }
16209
16210 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
16211   if (!MemVT.getVectorElementType().isSimple())
16212     return false;
16213
16214   uint64_t MaskForTy = 0ull;
16215   switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
16216   case MVT::i8:
16217     MaskForTy = 0xffull;
16218     break;
16219   case MVT::i16:
16220     MaskForTy = 0xffffull;
16221     break;
16222   case MVT::i32:
16223     MaskForTy = 0xffffffffull;
16224     break;
16225   default:
16226     return false;
16227     break;
16228   }
16229
16230   if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
16231     if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
16232       return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
16233
16234   return false;
16235 }
16236
16237 static bool isAllInactivePredicate(SDValue N) {
16238   // Look through cast.
16239   while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
16240     N = N.getOperand(0);
16241
16242   return ISD::isConstantSplatVectorAllZeros(N.getNode());
16243 }
16244
16245 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
16246   unsigned NumElts = N.getValueType().getVectorMinNumElements();
16247
16248   // Look through cast.
16249   while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
16250     N = N.getOperand(0);
16251     // When reinterpreting from a type with fewer elements the "new" elements
16252     // are not active, so bail if they're likely to be used.
16253     if (N.getValueType().getVectorMinNumElements() < NumElts)
16254       return false;
16255   }
16256
16257   if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
16258     return true;
16259
16260   // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
16261   // or smaller than the implicit element type represented by N.
16262   // NOTE: A larger element count implies a smaller element type.
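  // For example (illustrative): an nxv16i1 "ptrue p.b, all" reinterpreted as
  // an nxv4i1 predicate is still all active (16 >= 4), whereas an nxv4i1
  // "ptrue p.s, all" reinterpreted as nxv16i1 is not, because the extra lanes
  // are inactive.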
16263 if (N.getOpcode() == AArch64ISD::PTRUE && 16264 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all) 16265 return N.getValueType().getVectorMinNumElements() >= NumElts; 16266 16267 // If we're compiling for a specific vector-length, we can check if the 16268 // pattern's VL equals that of the scalable vector at runtime. 16269 if (N.getOpcode() == AArch64ISD::PTRUE) { 16270 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 16271 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); 16272 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); 16273 if (MaxSVESize && MinSVESize == MaxSVESize) { 16274 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock; 16275 unsigned PatNumElts = 16276 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0)); 16277 return PatNumElts == (NumElts * VScale); 16278 } 16279 } 16280 16281 return false; 16282 } 16283 16284 static SDValue performReinterpretCastCombine(SDNode *N) { 16285 SDValue LeafOp = SDValue(N, 0); 16286 SDValue Op = N->getOperand(0); 16287 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST && 16288 LeafOp.getValueType() != Op.getValueType()) 16289 Op = Op->getOperand(0); 16290 if (LeafOp.getValueType() == Op.getValueType()) 16291 return Op; 16292 return SDValue(); 16293 } 16294 16295 static SDValue performSVEAndCombine(SDNode *N, 16296 TargetLowering::DAGCombinerInfo &DCI) { 16297 if (DCI.isBeforeLegalizeOps()) 16298 return SDValue(); 16299 16300 SelectionDAG &DAG = DCI.DAG; 16301 SDValue Src = N->getOperand(0); 16302 unsigned Opc = Src->getOpcode(); 16303 16304 // Zero/any extend of an unsigned unpack 16305 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 16306 SDValue UnpkOp = Src->getOperand(0); 16307 SDValue Dup = N->getOperand(1); 16308 16309 if (Dup.getOpcode() != ISD::SPLAT_VECTOR) 16310 return SDValue(); 16311 16312 SDLoc DL(N); 16313 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); 16314 if (!C) 16315 return SDValue(); 16316 16317 uint64_t ExtVal = C->getZExtValue(); 16318 16319 // If the mask is fully covered by the unpack, we don't need to push 16320 // a new AND onto the operand 16321 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); 16322 if ((ExtVal == 0xFF && EltTy == MVT::i8) || 16323 (ExtVal == 0xFFFF && EltTy == MVT::i16) || 16324 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) 16325 return Src; 16326 16327 // Truncate to prevent a DUP with an over wide constant 16328 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); 16329 16330 // Otherwise, make sure we propagate the AND to the operand 16331 // of the unpack 16332 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0), 16333 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); 16334 16335 SDValue And = DAG.getNode(ISD::AND, DL, 16336 UnpkOp->getValueType(0), UnpkOp, Dup); 16337 16338 return DAG.getNode(Opc, DL, N->getValueType(0), And); 16339 } 16340 16341 // If both sides of AND operations are i1 splat_vectors then 16342 // we can produce just i1 splat_vector as the result. 16343 if (isAllActivePredicate(DAG, N->getOperand(0))) 16344 return N->getOperand(1); 16345 if (isAllActivePredicate(DAG, N->getOperand(1))) 16346 return N->getOperand(0); 16347 16348 if (!EnableCombineMGatherIntrinsics) 16349 return SDValue(); 16350 16351 SDValue Mask = N->getOperand(1); 16352 16353 if (!Src.hasOneUse()) 16354 return SDValue(); 16355 16356 EVT MemVT; 16357 16358 // SVE load instructions perform an implicit zero-extend, which makes them 16359 // perfect candidates for combining. 
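  // For example (illustrative): if Src is an LD1_MERGE_ZERO load of i8
  // elements zero-extended into an nxv8i16 result, an AND of that result with
  // splat(0xFF) is redundant and can be folded away by returning Src.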
16360 switch (Opc) { 16361 case AArch64ISD::LD1_MERGE_ZERO: 16362 case AArch64ISD::LDNF1_MERGE_ZERO: 16363 case AArch64ISD::LDFF1_MERGE_ZERO: 16364 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); 16365 break; 16366 case AArch64ISD::GLD1_MERGE_ZERO: 16367 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 16368 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 16369 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 16370 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 16371 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 16372 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 16373 case AArch64ISD::GLDFF1_MERGE_ZERO: 16374 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 16375 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 16376 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 16377 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 16378 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 16379 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 16380 case AArch64ISD::GLDNT1_MERGE_ZERO: 16381 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); 16382 break; 16383 default: 16384 return SDValue(); 16385 } 16386 16387 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) 16388 return Src; 16389 16390 return SDValue(); 16391 } 16392 16393 static SDValue performANDCombine(SDNode *N, 16394 TargetLowering::DAGCombinerInfo &DCI) { 16395 SelectionDAG &DAG = DCI.DAG; 16396 SDValue LHS = N->getOperand(0); 16397 SDValue RHS = N->getOperand(1); 16398 EVT VT = N->getValueType(0); 16399 16400 if (SDValue R = performANDORCSELCombine(N, DAG)) 16401 return R; 16402 16403 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16404 return SDValue(); 16405 16406 if (VT.isScalableVector()) 16407 return performSVEAndCombine(N, DCI); 16408 16409 // The combining code below works only for NEON vectors. In particular, it 16410 // does not work for SVE when dealing with vectors wider than 128 bits. 16411 if (!VT.is64BitVector() && !VT.is128BitVector()) 16412 return SDValue(); 16413 16414 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); 16415 if (!BVN) 16416 return SDValue(); 16417 16418 // AND does not accept an immediate, so check if we can use a BIC immediate 16419 // instruction instead. We do this here instead of using a (and x, (mvni imm)) 16420 // pattern in isel, because some immediates may be lowered to the preferred 16421 // (and x, (movi imm)) form, even though an mvni representation also exists. 
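  // For example (illustrative): (and (v4i32 X), splat(0xffffff00)) can be
  // selected as "bic v0.4s, #0xff", clearing the low byte of every lane
  // without materializing the mask in a register.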
16422 APInt DefBits(VT.getSizeInBits(), 0); 16423 APInt UndefBits(VT.getSizeInBits(), 0); 16424 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 16425 SDValue NewOp; 16426 16427 DefBits = ~DefBits; 16428 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 16429 DefBits, &LHS)) || 16430 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 16431 DefBits, &LHS))) 16432 return NewOp; 16433 16434 UndefBits = ~UndefBits; 16435 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 16436 UndefBits, &LHS)) || 16437 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 16438 UndefBits, &LHS))) 16439 return NewOp; 16440 } 16441 16442 return SDValue(); 16443 } 16444 16445 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { 16446 switch (Opcode) { 16447 case ISD::STRICT_FADD: 16448 case ISD::FADD: 16449 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; 16450 case ISD::ADD: 16451 return VT == MVT::i64; 16452 default: 16453 return false; 16454 } 16455 } 16456 16457 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, 16458 AArch64CC::CondCode Cond); 16459 16460 static bool isPredicateCCSettingOp(SDValue N) { 16461 if ((N.getOpcode() == ISD::SETCC) || 16462 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 16463 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || 16464 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || 16465 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi || 16466 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs || 16467 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || 16468 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || 16469 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || 16470 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || 16471 // get_active_lane_mask is lowered to a whilelo instruction. 16472 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask))) 16473 return true; 16474 16475 return false; 16476 } 16477 16478 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0> 16479 // ... into: "ptrue p, all" + PTEST 16480 static SDValue 16481 performFirstTrueTestVectorCombine(SDNode *N, 16482 TargetLowering::DAGCombinerInfo &DCI, 16483 const AArch64Subtarget *Subtarget) { 16484 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); 16485 // Make sure PTEST can be legalised with illegal types. 16486 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) 16487 return SDValue(); 16488 16489 SDValue N0 = N->getOperand(0); 16490 EVT VT = N0.getValueType(); 16491 16492 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 || 16493 !isNullConstant(N->getOperand(1))) 16494 return SDValue(); 16495 16496 // Restricted the DAG combine to only cases where we're extracting from a 16497 // flag-setting operation. 16498 if (!isPredicateCCSettingOp(N0)) 16499 return SDValue(); 16500 16501 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0 16502 SelectionDAG &DAG = DCI.DAG; 16503 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all); 16504 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE); 16505 } 16506 16507 // Materialize : Idx = (add (mul vscale, NumEls), -1) 16508 // i1 = extract_vector_elt t37, Constant:i64<Idx> 16509 // ... 
into: "ptrue p, all" + PTEST 16510 static SDValue 16511 performLastTrueTestVectorCombine(SDNode *N, 16512 TargetLowering::DAGCombinerInfo &DCI, 16513 const AArch64Subtarget *Subtarget) { 16514 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); 16515 // Make sure PTEST is legal types. 16516 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) 16517 return SDValue(); 16518 16519 SDValue N0 = N->getOperand(0); 16520 EVT OpVT = N0.getValueType(); 16521 16522 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) 16523 return SDValue(); 16524 16525 // Idx == (add (mul vscale, NumEls), -1) 16526 SDValue Idx = N->getOperand(1); 16527 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1))) 16528 return SDValue(); 16529 16530 SDValue VS = Idx.getOperand(0); 16531 if (VS.getOpcode() != ISD::VSCALE) 16532 return SDValue(); 16533 16534 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue(); 16535 if (VS.getConstantOperandVal(0) != NumEls) 16536 return SDValue(); 16537 16538 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0 16539 SelectionDAG &DAG = DCI.DAG; 16540 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all); 16541 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE); 16542 } 16543 16544 static SDValue 16545 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 16546 const AArch64Subtarget *Subtarget) { 16547 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); 16548 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget)) 16549 return Res; 16550 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget)) 16551 return Res; 16552 16553 SelectionDAG &DAG = DCI.DAG; 16554 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 16555 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1); 16556 16557 EVT VT = N->getValueType(0); 16558 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); 16559 bool IsStrict = N0->isStrictFPOpcode(); 16560 16561 // extract(dup x) -> x 16562 if (N0.getOpcode() == AArch64ISD::DUP) 16563 return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); 16564 16565 // Rewrite for pairwise fadd pattern 16566 // (f32 (extract_vector_elt 16567 // (fadd (vXf32 Other) 16568 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) 16569 // -> 16570 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) 16571 // (extract_vector_elt (vXf32 Other) 1)) 16572 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so 16573 // we can only do this when it's used only by the extract_vector_elt. 16574 if (ConstantN1 && ConstantN1->getZExtValue() == 0 && 16575 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) && 16576 (!IsStrict || N0.hasOneUse())) { 16577 SDLoc DL(N0); 16578 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0); 16579 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1); 16580 16581 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01); 16582 SDValue Other = N00; 16583 16584 // And handle the commutative case. 
16585 if (!Shuffle) { 16586 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00); 16587 Other = N01; 16588 } 16589 16590 if (Shuffle && Shuffle->getMaskElt(0) == 1 && 16591 Other == Shuffle->getOperand(0)) { 16592 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, 16593 DAG.getConstant(0, DL, MVT::i64)); 16594 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, 16595 DAG.getConstant(1, DL, MVT::i64)); 16596 if (!IsStrict) 16597 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2); 16598 16599 // For strict_fadd we need uses of the final extract_vector to be replaced 16600 // with the strict_fadd, but we also need uses of the chain output of the 16601 // original strict_fadd to use the chain output of the new strict_fadd as 16602 // otherwise it may not be deleted. 16603 SDValue Ret = DAG.getNode(N0->getOpcode(), DL, 16604 {VT, MVT::Other}, 16605 {N0->getOperand(0), Extract1, Extract2}); 16606 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret); 16607 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1)); 16608 return SDValue(N, 0); 16609 } 16610 } 16611 16612 return SDValue(); 16613 } 16614 16615 static SDValue performConcatVectorsCombine(SDNode *N, 16616 TargetLowering::DAGCombinerInfo &DCI, 16617 SelectionDAG &DAG) { 16618 SDLoc dl(N); 16619 EVT VT = N->getValueType(0); 16620 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 16621 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); 16622 16623 if (VT.isScalableVector()) 16624 return SDValue(); 16625 16626 // Optimize concat_vectors of truncated vectors, where the intermediate 16627 // type is illegal, to avoid said illegality, e.g., 16628 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 16629 // (v2i16 (truncate (v2i64))))) 16630 // -> 16631 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 16632 // (v4i32 (bitcast (v2i64))), 16633 // <0, 2, 4, 6>))) 16634 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 16635 // on both input and result type, so we might generate worse code. 16636 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 16637 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && 16638 N1Opc == ISD::TRUNCATE) { 16639 SDValue N00 = N0->getOperand(0); 16640 SDValue N10 = N1->getOperand(0); 16641 EVT N00VT = N00.getValueType(); 16642 16643 if (N00VT == N10.getValueType() && 16644 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 16645 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 16646 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); 16647 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 16648 for (size_t i = 0; i < Mask.size(); ++i) 16649 Mask[i] = i * 2; 16650 return DAG.getNode(ISD::TRUNCATE, dl, VT, 16651 DAG.getVectorShuffle( 16652 MidVT, dl, 16653 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 16654 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 16655 } 16656 } 16657 16658 if (N->getOperand(0).getValueType() == MVT::v4i8) { 16659 // If we have a concat of v4i8 loads, convert them to a buildvector of f32 16660 // loads to prevent having to go through the v4i8 load legalization that 16661 // needs to extend each element into a larger type. 
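  // For example (illustrative): (v8i8 (concat_vectors (v4i8 (load %p)),
  // (v4i8 (load %q)))) becomes (v8i8 (bitcast (v2f32 (build_vector
  // (f32 (load %p)), (f32 (load %q)))))), so each half is loaded directly as
  // a single 32-bit value.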
16662 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) { 16663 if (V.getValueType() != MVT::v4i8) 16664 return false; 16665 if (V.isUndef()) 16666 return true; 16667 LoadSDNode *LD = dyn_cast<LoadSDNode>(V); 16668 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() && 16669 LD->getExtensionType() == ISD::NON_EXTLOAD; 16670 })) { 16671 EVT NVT = 16672 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands()); 16673 SmallVector<SDValue> Ops; 16674 16675 for (unsigned i = 0; i < N->getNumOperands(); i++) { 16676 SDValue V = N->getOperand(i); 16677 if (V.isUndef()) 16678 Ops.push_back(DAG.getUNDEF(MVT::f32)); 16679 else { 16680 LoadSDNode *LD = cast<LoadSDNode>(V); 16681 SDValue NewLoad = 16682 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(), 16683 LD->getMemOperand()); 16684 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); 16685 Ops.push_back(NewLoad); 16686 } 16687 } 16688 return DAG.getBitcast(N->getValueType(0), 16689 DAG.getBuildVector(NVT, dl, Ops)); 16690 } 16691 } 16692 16693 // Canonicalise concat_vectors to replace concatenations of truncated nots 16694 // with nots of concatenated truncates. This in some cases allows for multiple 16695 // redundant negations to be eliminated. 16696 // (concat_vectors (v4i16 (truncate (not (v4i32)))), 16697 // (v4i16 (truncate (not (v4i32))))) 16698 // -> 16699 // (not (concat_vectors (v4i16 (truncate (v4i32))), 16700 // (v4i16 (truncate (v4i32))))) 16701 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && 16702 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) && 16703 N->isOnlyUserOf(N1.getNode())) { 16704 auto isBitwiseVectorNegate = [](SDValue V) { 16705 return V->getOpcode() == ISD::XOR && 16706 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode()); 16707 }; 16708 SDValue N00 = N0->getOperand(0); 16709 SDValue N10 = N1->getOperand(0); 16710 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) && 16711 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) { 16712 return DAG.getNOT( 16713 dl, 16714 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 16715 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(), 16716 N00->getOperand(0)), 16717 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(), 16718 N10->getOperand(0))), 16719 VT); 16720 } 16721 } 16722 16723 // Wait till after everything is legalized to try this. That way we have 16724 // legal vector types and such. 16725 if (DCI.isBeforeLegalizeOps()) 16726 return SDValue(); 16727 16728 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use 16729 // extracted subvectors from the same original vectors. Combine these into a 16730 // single avg that operates on the two original vectors. 16731 // avgceil is the target independant name for rhadd, avgfloor is a hadd. 
16732 // Example: 16733 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>), 16734 // extract_subvector (v16i8 OpB, <0>))), 16735 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>), 16736 // extract_subvector (v16i8 OpB, <8>))))) 16737 // -> 16738 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB))) 16739 if (N->getNumOperands() == 2 && N0Opc == N1Opc && 16740 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || 16741 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { 16742 SDValue N00 = N0->getOperand(0); 16743 SDValue N01 = N0->getOperand(1); 16744 SDValue N10 = N1->getOperand(0); 16745 SDValue N11 = N1->getOperand(1); 16746 16747 EVT N00VT = N00.getValueType(); 16748 EVT N10VT = N10.getValueType(); 16749 16750 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && 16751 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && 16752 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && 16753 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { 16754 SDValue N00Source = N00->getOperand(0); 16755 SDValue N01Source = N01->getOperand(0); 16756 SDValue N10Source = N10->getOperand(0); 16757 SDValue N11Source = N11->getOperand(0); 16758 16759 if (N00Source == N10Source && N01Source == N11Source && 16760 N00Source.getValueType() == VT && N01Source.getValueType() == VT) { 16761 assert(N0.getValueType() == N1.getValueType()); 16762 16763 uint64_t N00Index = N00.getConstantOperandVal(1); 16764 uint64_t N01Index = N01.getConstantOperandVal(1); 16765 uint64_t N10Index = N10.getConstantOperandVal(1); 16766 uint64_t N11Index = N11.getConstantOperandVal(1); 16767 16768 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && 16769 N10Index == N00VT.getVectorNumElements()) 16770 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); 16771 } 16772 } 16773 } 16774 16775 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 16776 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 16777 // canonicalise to that. 16778 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { 16779 assert(VT.getScalarSizeInBits() == 64); 16780 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 16781 DAG.getConstant(0, dl, MVT::i64)); 16782 } 16783 16784 // Canonicalise concat_vectors so that the right-hand vector has as few 16785 // bit-casts as possible before its real operation. The primary matching 16786 // destination for these operations will be the narrowing "2" instructions, 16787 // which depend on the operation being performed on this right-hand vector. 16788 // For example, 16789 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 16790 // becomes 16791 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 16792 16793 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST) 16794 return SDValue(); 16795 SDValue RHS = N1->getOperand(0); 16796 MVT RHSTy = RHS.getValueType().getSimpleVT(); 16797 // If the RHS is not a vector, this is not the pattern we're looking for. 
16798 if (!RHSTy.isVector()) 16799 return SDValue(); 16800 16801 LLVM_DEBUG( 16802 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 16803 16804 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 16805 RHSTy.getVectorNumElements() * 2); 16806 return DAG.getNode(ISD::BITCAST, dl, VT, 16807 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 16808 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 16809 RHS)); 16810 } 16811 16812 static SDValue 16813 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 16814 SelectionDAG &DAG) { 16815 if (DCI.isBeforeLegalizeOps()) 16816 return SDValue(); 16817 16818 EVT VT = N->getValueType(0); 16819 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) 16820 return SDValue(); 16821 16822 SDValue V = N->getOperand(0); 16823 16824 // NOTE: This combine exists in DAGCombiner, but that version's legality check 16825 // blocks this combine because the non-const case requires custom lowering. 16826 // 16827 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const) 16828 if (V.getOpcode() == ISD::SPLAT_VECTOR) 16829 if (isa<ConstantSDNode>(V.getOperand(0))) 16830 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0)); 16831 16832 return SDValue(); 16833 } 16834 16835 static SDValue 16836 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 16837 SelectionDAG &DAG) { 16838 SDLoc DL(N); 16839 SDValue Vec = N->getOperand(0); 16840 SDValue SubVec = N->getOperand(1); 16841 uint64_t IdxVal = N->getConstantOperandVal(2); 16842 EVT VecVT = Vec.getValueType(); 16843 EVT SubVT = SubVec.getValueType(); 16844 16845 // Only do this for legal fixed vector types. 16846 if (!VecVT.isFixedLengthVector() || 16847 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || 16848 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) 16849 return SDValue(); 16850 16851 // Ignore widening patterns. 16852 if (IdxVal == 0 && Vec.isUndef()) 16853 return SDValue(); 16854 16855 // Subvector must be half the width and an "aligned" insertion. 16856 unsigned NumSubElts = SubVT.getVectorNumElements(); 16857 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || 16858 (IdxVal != 0 && IdxVal != NumSubElts)) 16859 return SDValue(); 16860 16861 // Fold insert_subvector -> concat_vectors 16862 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) 16863 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) 16864 SDValue Lo, Hi; 16865 if (IdxVal == 0) { 16866 Lo = SubVec; 16867 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 16868 DAG.getVectorIdxConstant(NumSubElts, DL)); 16869 } else { 16870 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 16871 DAG.getVectorIdxConstant(0, DL)); 16872 Hi = SubVec; 16873 } 16874 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); 16875 } 16876 16877 static SDValue tryCombineFixedPointConvert(SDNode *N, 16878 TargetLowering::DAGCombinerInfo &DCI, 16879 SelectionDAG &DAG) { 16880 // Wait until after everything is legalized to try this. That way we have 16881 // legal vector types and such. 16882 if (DCI.isBeforeLegalizeOps()) 16883 return SDValue(); 16884 // Transform a scalar conversion of a value from a lane extract into a 16885 // lane extract of a vector conversion. 
E.g., from foo1 to foo2: 16886 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 16887 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 16888 // 16889 // The second form interacts better with instruction selection and the 16890 // register allocator to avoid cross-class register copies that aren't 16891 // coalescable due to a lane reference. 16892 16893 // Check the operand and see if it originates from a lane extract. 16894 SDValue Op1 = N->getOperand(1); 16895 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 16896 return SDValue(); 16897 16898 // Yep, no additional predication needed. Perform the transform. 16899 SDValue IID = N->getOperand(0); 16900 SDValue Shift = N->getOperand(2); 16901 SDValue Vec = Op1.getOperand(0); 16902 SDValue Lane = Op1.getOperand(1); 16903 EVT ResTy = N->getValueType(0); 16904 EVT VecResTy; 16905 SDLoc DL(N); 16906 16907 // The vector width should be 128 bits by the time we get here, even 16908 // if it started as 64 bits (the extract_vector handling will have 16909 // done so). Bail if it is not. 16910 if (Vec.getValueSizeInBits() != 128) 16911 return SDValue(); 16912 16913 if (Vec.getValueType() == MVT::v4i32) 16914 VecResTy = MVT::v4f32; 16915 else if (Vec.getValueType() == MVT::v2i64) 16916 VecResTy = MVT::v2f64; 16917 else 16918 return SDValue(); 16919 16920 SDValue Convert = 16921 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 16922 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 16923 } 16924 16925 // AArch64 high-vector "long" operations are formed by performing the non-high 16926 // version on an extract_subvector of each operand which gets the high half: 16927 // 16928 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 16929 // 16930 // However, there are cases which don't have an extract_high explicitly, but 16931 // have another operation that can be made compatible with one for free. For 16932 // example: 16933 // 16934 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 16935 // 16936 // This routine does the actual conversion of such DUPs, once outer routines 16937 // have determined that everything else is in order. 16938 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold 16939 // similarly here. 16940 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { 16941 MVT VT = N.getSimpleValueType(); 16942 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR && 16943 N.getConstantOperandVal(1) == 0) 16944 N = N.getOperand(0); 16945 16946 switch (N.getOpcode()) { 16947 case AArch64ISD::DUP: 16948 case AArch64ISD::DUPLANE8: 16949 case AArch64ISD::DUPLANE16: 16950 case AArch64ISD::DUPLANE32: 16951 case AArch64ISD::DUPLANE64: 16952 case AArch64ISD::MOVI: 16953 case AArch64ISD::MOVIshift: 16954 case AArch64ISD::MOVIedit: 16955 case AArch64ISD::MOVImsl: 16956 case AArch64ISD::MVNIshift: 16957 case AArch64ISD::MVNImsl: 16958 break; 16959 default: 16960 // FMOV could be supported, but isn't very useful, as it would only occur 16961 // if you passed a bitcast' floating point immediate to an eligible long 16962 // integer op (addl, smull, ...). 
16963     return SDValue();
16964   }
16965
16966   if (!VT.is64BitVector())
16967     return SDValue();
16968
16969   SDLoc DL(N);
16970   unsigned NumElems = VT.getVectorNumElements();
16971   if (N.getValueType().is64BitVector()) {
16972     MVT ElementTy = VT.getVectorElementType();
16973     MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
16974     N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
16975   }
16976
16977   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
16978                      DAG.getConstant(NumElems, DL, MVT::i64));
16979 }
16980
16981 static bool isEssentiallyExtractHighSubvector(SDValue N) {
16982   if (N.getOpcode() == ISD::BITCAST)
16983     N = N.getOperand(0);
16984   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
16985     return false;
16986   if (N.getOperand(0).getValueType().isScalableVector())
16987     return false;
16988   return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
16989          N.getOperand(0).getValueType().getVectorNumElements() / 2;
16990 }
16991
16992 /// Helper structure to keep track of ISD::SET_CC operands.
16993 struct GenericSetCCInfo {
16994   const SDValue *Opnd0;
16995   const SDValue *Opnd1;
16996   ISD::CondCode CC;
16997 };
16998
16999 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
17000 struct AArch64SetCCInfo {
17001   const SDValue *Cmp;
17002   AArch64CC::CondCode CC;
17003 };
17004
17005 /// Helper structure to keep track of SetCC information.
17006 union SetCCInfo {
17007   GenericSetCCInfo Generic;
17008   AArch64SetCCInfo AArch64;
17009 };
17010
17011 /// Helper structure to be able to read SetCC information. If the IsAArch64
17012 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
17013 /// GenericSetCCInfo.
17014 struct SetCCInfoAndKind {
17015   SetCCInfo Info;
17016   bool IsAArch64;
17017 };
17018
17019 /// Check whether or not \p Op is a SET_CC operation,
17020 /// either a generic one
17021 /// or an AArch64 lowered one.
17022 /// \p SetCCInfo is filled accordingly.
17023 /// \post SetCCInfo is meaningful only when this function returns true.
17024 /// \return True when Op is a kind of SET_CC operation.
17025 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
17026   // If this is a setcc, this is straightforward.
17027   if (Op.getOpcode() == ISD::SETCC) {
17028     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
17029     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
17030     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17031     SetCCInfo.IsAArch64 = false;
17032     return true;
17033   }
17034   // Otherwise, check if this is a matching csel instruction.
17035   // In other words:
17036   // - csel 1, 0, cc
17037   // - csel 0, 1, !cc
17038   if (Op.getOpcode() != AArch64ISD::CSEL)
17039     return false;
17040   // Set the information about the operands.
17041   // TODO: we want the operands of the Cmp not the csel
17042   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
17043   SetCCInfo.IsAArch64 = true;
17044   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
17045       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
17046
17047   // Check that the operands match the constraints:
17048   // (1) Both operands must be constants.
17049   // (2) One must be 1 and the other must be 0.
17050   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
17051   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
17052
17053   // Check (1).
17054   if (!TValue || !FValue)
17055     return false;
17056
17057   // Check (2).
17058   if (!TValue->isOne()) {
17059     // Update the comparison when we are interested in !cc.
17060 std::swap(TValue, FValue); 17061 SetCCInfo.Info.AArch64.CC = 17062 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); 17063 } 17064 return TValue->isOne() && FValue->isZero(); 17065 } 17066 17067 // Returns true if Op is setcc or zext of setcc. 17068 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { 17069 if (isSetCC(Op, Info)) 17070 return true; 17071 return ((Op.getOpcode() == ISD::ZERO_EXTEND) && 17072 isSetCC(Op->getOperand(0), Info)); 17073 } 17074 17075 // The folding we want to perform is: 17076 // (add x, [zext] (setcc cc ...) ) 17077 // --> 17078 // (csel x, (add x, 1), !cc ...) 17079 // 17080 // The latter will get matched to a CSINC instruction. 17081 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { 17082 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); 17083 SDValue LHS = Op->getOperand(0); 17084 SDValue RHS = Op->getOperand(1); 17085 SetCCInfoAndKind InfoAndKind; 17086 17087 // If both operands are a SET_CC, then we don't want to perform this 17088 // folding and create another csel as this results in more instructions 17089 // (and higher register usage). 17090 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) && 17091 isSetCCOrZExtSetCC(RHS, InfoAndKind)) 17092 return SDValue(); 17093 17094 // If neither operand is a SET_CC, give up. 17095 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { 17096 std::swap(LHS, RHS); 17097 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) 17098 return SDValue(); 17099 } 17100 17101 // FIXME: This could be generatized to work for FP comparisons. 17102 EVT CmpVT = InfoAndKind.IsAArch64 17103 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() 17104 : InfoAndKind.Info.Generic.Opnd0->getValueType(); 17105 if (CmpVT != MVT::i32 && CmpVT != MVT::i64) 17106 return SDValue(); 17107 17108 SDValue CCVal; 17109 SDValue Cmp; 17110 SDLoc dl(Op); 17111 if (InfoAndKind.IsAArch64) { 17112 CCVal = DAG.getConstant( 17113 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, 17114 MVT::i32); 17115 Cmp = *InfoAndKind.Info.AArch64.Cmp; 17116 } else 17117 Cmp = getAArch64Cmp( 17118 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, 17119 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG, 17120 dl); 17121 17122 EVT VT = Op->getValueType(0); 17123 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); 17124 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); 17125 } 17126 17127 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) 17128 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { 17129 EVT VT = N->getValueType(0); 17130 // Only scalar integer and vector types. 
17131   if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
17132     return SDValue();
17133
17134   SDValue LHS = N->getOperand(0);
17135   SDValue RHS = N->getOperand(1);
17136   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17137       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
17138     return SDValue();
17139
17140   auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17141   auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
17142   if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
17143     return SDValue();
17144
17145   SDValue Op1 = LHS->getOperand(0);
17146   SDValue Op2 = RHS->getOperand(0);
17147   EVT OpVT1 = Op1.getValueType();
17148   EVT OpVT2 = Op2.getValueType();
17149   if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
17150       Op2.getOpcode() != AArch64ISD::UADDV ||
17151       OpVT1.getVectorElementType() != VT)
17152     return SDValue();
17153
17154   SDValue Val1 = Op1.getOperand(0);
17155   SDValue Val2 = Op2.getOperand(0);
17156   EVT ValVT = Val1->getValueType(0);
17157   SDLoc DL(N);
17158   SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
17159   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17160                      DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
17161                      DAG.getConstant(0, DL, MVT::i64));
17162 }
17163
17164 /// Perform the scalar expression combine in the form of:
17165 /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
17166 /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
17167 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
17168   EVT VT = N->getValueType(0);
17169   if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
17170     return SDValue();
17171
17172   SDValue LHS = N->getOperand(0);
17173   SDValue RHS = N->getOperand(1);
17174
17175   // Handle commutativity.
17176   if (LHS.getOpcode() != AArch64ISD::CSEL &&
17177       LHS.getOpcode() != AArch64ISD::CSNEG) {
17178     std::swap(LHS, RHS);
17179     if (LHS.getOpcode() != AArch64ISD::CSEL &&
17180         LHS.getOpcode() != AArch64ISD::CSNEG) {
17181       return SDValue();
17182     }
17183   }
17184
17185   if (!LHS.hasOneUse())
17186     return SDValue();
17187
17188   AArch64CC::CondCode AArch64CC =
17189       static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
17190
17191   // The CSEL should have a constant one operand, and the CSNEG should have a
17192   // one or negative-one operand.
17193   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
17194   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
17195   if (!CTVal || !CFVal)
17196     return SDValue();
17197
17198   if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
17199         (CTVal->isOne() || CFVal->isOne())) &&
17200       !(LHS.getOpcode() == AArch64ISD::CSNEG &&
17201         (CTVal->isOne() || CFVal->isAllOnes())))
17202     return SDValue();
17203
17204   // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
17205   if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
17206       !CFVal->isOne()) {
17207     std::swap(CTVal, CFVal);
17208     AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
17209   }
17210
17211   SDLoc DL(N);
17212   // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
17213   if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
17214       !CFVal->isAllOnes()) {
17215     APInt C = -1 * CFVal->getAPIntValue();
17216     CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
17217     CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
17218     AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
17219   }
17220
17221   // The combine might be neutral for larger constants, as the immediate needs
17222   // to be materialized in a register.
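  // For example (illustrative): CSEL(5, 1, eq) + w8 becomes
  // CSINC(w8 + 5, w8, eq), i.e. w8 + 5 when the condition holds and w8 + 1
  // otherwise, provided #5 is a legal ADD immediate.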
17223 APInt ADDC = CTVal->getAPIntValue(); 17224 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17225 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) 17226 return SDValue(); 17227 17228 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) || 17229 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) && 17230 "Unexpected constant value"); 17231 17232 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0)); 17233 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32); 17234 SDValue Cmp = LHS.getOperand(3); 17235 17236 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp); 17237 } 17238 17239 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y) 17240 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { 17241 EVT VT = N->getValueType(0); 17242 if (N->getOpcode() != ISD::ADD) 17243 return SDValue(); 17244 17245 SDValue Dot = N->getOperand(0); 17246 SDValue A = N->getOperand(1); 17247 // Handle commutivity 17248 auto isZeroDot = [](SDValue Dot) { 17249 return (Dot.getOpcode() == AArch64ISD::UDOT || 17250 Dot.getOpcode() == AArch64ISD::SDOT) && 17251 isZerosVector(Dot.getOperand(0).getNode()); 17252 }; 17253 if (!isZeroDot(Dot)) 17254 std::swap(Dot, A); 17255 if (!isZeroDot(Dot)) 17256 return SDValue(); 17257 17258 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1), 17259 Dot.getOperand(2)); 17260 } 17261 17262 static bool isNegatedInteger(SDValue Op) { 17263 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)); 17264 } 17265 17266 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { 17267 SDLoc DL(Op); 17268 EVT VT = Op.getValueType(); 17269 SDValue Zero = DAG.getConstant(0, DL, VT); 17270 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); 17271 } 17272 17273 // Try to fold 17274 // 17275 // (neg (csel X, Y)) -> (csel (neg X), (neg Y)) 17276 // 17277 // The folding helps csel to be matched with csneg without generating 17278 // redundant neg instruction, which includes negation of the csel expansion 17279 // of abs node lowered by lowerABS. 17280 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { 17281 if (!isNegatedInteger(SDValue(N, 0))) 17282 return SDValue(); 17283 17284 SDValue CSel = N->getOperand(1); 17285 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse()) 17286 return SDValue(); 17287 17288 SDValue N0 = CSel.getOperand(0); 17289 SDValue N1 = CSel.getOperand(1); 17290 17291 // If both of them is not negations, it's not worth the folding as it 17292 // introduces two additional negations while reducing one negation. 17293 if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) 17294 return SDValue(); 17295 17296 SDValue N0N = getNegatedInteger(N0, DAG); 17297 SDValue N1N = getNegatedInteger(N1, DAG); 17298 17299 SDLoc DL(N); 17300 EVT VT = CSel.getValueType(); 17301 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), 17302 CSel.getOperand(3)); 17303 } 17304 17305 // The basic add/sub long vector instructions have variants with "2" on the end 17306 // which act on the high-half of their inputs. They are normally matched by 17307 // patterns like: 17308 // 17309 // (add (zeroext (extract_high LHS)), 17310 // (zeroext (extract_high RHS))) 17311 // -> uaddl2 vD, vN, vM 17312 // 17313 // However, if one of the extracts is something like a duplicate, this 17314 // instruction can still be used profitably. This function puts the DAG into a 17315 // more appropriate form for those patterns to trigger. 
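// For example (illustrative):
//   (v8i16 (add (zext (v8i8 (extract_high (v16i8 X)))),
//               (zext (v8i8 (dup scalar)))))
// can be selected as a single uaddl2 once the dup is rewritten as
// (extract_high (v16i8 (dup128 scalar))).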
17316 static SDValue performAddSubLongCombine(SDNode *N, 17317 TargetLowering::DAGCombinerInfo &DCI, 17318 SelectionDAG &DAG) { 17319 if (DCI.isBeforeLegalizeOps()) 17320 return SDValue(); 17321 17322 MVT VT = N->getSimpleValueType(0); 17323 if (!VT.is128BitVector()) { 17324 if (N->getOpcode() == ISD::ADD) 17325 return performSetccAddFolding(N, DAG); 17326 return SDValue(); 17327 } 17328 17329 // Make sure both branches are extended in the same way. 17330 SDValue LHS = N->getOperand(0); 17331 SDValue RHS = N->getOperand(1); 17332 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 17333 LHS.getOpcode() != ISD::SIGN_EXTEND) || 17334 LHS.getOpcode() != RHS.getOpcode()) 17335 return SDValue(); 17336 17337 unsigned ExtType = LHS.getOpcode(); 17338 17339 // It's not worth doing if at least one of the inputs isn't already an 17340 // extract, but we don't know which it'll be so we have to try both. 17341 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { 17342 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 17343 if (!RHS.getNode()) 17344 return SDValue(); 17345 17346 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 17347 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { 17348 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 17349 if (!LHS.getNode()) 17350 return SDValue(); 17351 17352 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 17353 } 17354 17355 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 17356 } 17357 17358 static bool isCMP(SDValue Op) { 17359 return Op.getOpcode() == AArch64ISD::SUBS && 17360 !Op.getNode()->hasAnyUseOfValue(0); 17361 } 17362 17363 // (CSEL 1 0 CC Cond) => CC 17364 // (CSEL 0 1 CC Cond) => !CC 17365 static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) { 17366 if (Op.getOpcode() != AArch64ISD::CSEL) 17367 return std::nullopt; 17368 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2)); 17369 if (CC == AArch64CC::AL || CC == AArch64CC::NV) 17370 return std::nullopt; 17371 SDValue OpLHS = Op.getOperand(0); 17372 SDValue OpRHS = Op.getOperand(1); 17373 if (isOneConstant(OpLHS) && isNullConstant(OpRHS)) 17374 return CC; 17375 if (isNullConstant(OpLHS) && isOneConstant(OpRHS)) 17376 return getInvertedCondCode(CC); 17377 17378 return std::nullopt; 17379 } 17380 17381 // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry) 17382 // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry) 17383 static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) { 17384 SDValue CmpOp = Op->getOperand(2); 17385 if (!isCMP(CmpOp)) 17386 return SDValue(); 17387 17388 if (IsAdd) { 17389 if (!isOneConstant(CmpOp.getOperand(1))) 17390 return SDValue(); 17391 } else { 17392 if (!isNullConstant(CmpOp.getOperand(0))) 17393 return SDValue(); 17394 } 17395 17396 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1); 17397 auto CC = getCSETCondCode(CsetOp); 17398 if (CC != (IsAdd ? 
AArch64CC::HS : AArch64CC::LO)) 17399 return SDValue(); 17400 17401 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(), 17402 Op->getOperand(0), Op->getOperand(1), 17403 CsetOp.getOperand(3)); 17404 } 17405 17406 // (ADC x 0 cond) => (CINC x HS cond) 17407 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { 17408 SDValue LHS = N->getOperand(0); 17409 SDValue RHS = N->getOperand(1); 17410 SDValue Cond = N->getOperand(2); 17411 17412 if (!isNullConstant(RHS)) 17413 return SDValue(); 17414 17415 EVT VT = N->getValueType(0); 17416 SDLoc DL(N); 17417 17418 // (CINC x cc cond) <=> (CSINC x x !cc cond) 17419 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32); 17420 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); 17421 } 17422 17423 // Transform vector add(zext i8 to i32, zext i8 to i32) 17424 // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) 17425 // This allows extra uses of saddl/uaddl at the lower vector widths, and less 17426 // extends. 17427 static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) { 17428 EVT VT = N->getValueType(0); 17429 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || 17430 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && 17431 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || 17432 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && 17433 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || 17434 N->getOperand(0).getOperand(0).getValueType() != 17435 N->getOperand(1).getOperand(0).getValueType()) 17436 return SDValue(); 17437 17438 SDValue N0 = N->getOperand(0).getOperand(0); 17439 SDValue N1 = N->getOperand(1).getOperand(0); 17440 EVT InVT = N0.getValueType(); 17441 17442 EVT S1 = InVT.getScalarType(); 17443 EVT S2 = VT.getScalarType(); 17444 if ((S2 == MVT::i32 && S1 == MVT::i8) || 17445 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { 17446 SDLoc DL(N); 17447 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), 17448 S2.getHalfSizedIntegerVT(*DAG.getContext()), 17449 VT.getVectorElementCount()); 17450 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); 17451 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); 17452 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); 17453 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp); 17454 } 17455 return SDValue(); 17456 } 17457 17458 static SDValue performBuildVectorCombine(SDNode *N, 17459 TargetLowering::DAGCombinerInfo &DCI, 17460 SelectionDAG &DAG) { 17461 SDLoc DL(N); 17462 EVT VT = N->getValueType(0); 17463 17464 // A build vector of two extracted elements is equivalent to an 17465 // extract subvector where the inner vector is any-extended to the 17466 // extract_vector_elt VT. 17467 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0) 17468 // (extract_elt_iXX_to_i32 vec Idx+1)) 17469 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx) 17470 17471 // For now, only consider the v2i32 case, which arises as a result of 17472 // legalization. 17473 if (VT != MVT::v2i32) 17474 return SDValue(); 17475 17476 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1); 17477 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT. 17478 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 17479 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 17480 // Constant index. 17481 isa<ConstantSDNode>(Elt0->getOperand(1)) && 17482 isa<ConstantSDNode>(Elt1->getOperand(1)) && 17483 // Both EXTRACT_VECTOR_ELT from same vector... 
17484       Elt0->getOperand(0) == Elt1->getOperand(0) &&
17485       // ... and contiguous. First element's index +1 == second element's index.
17486       Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
17487       // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
17488       // ResultType's known minimum vector length.
17489       Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
17490     SDValue VecToExtend = Elt0->getOperand(0);
17491     EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
17492     if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
17493       return SDValue();
17494
17495     SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
17496
17497     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
17498     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
17499                        SubvectorIdx);
17500   }
17501
17502   return SDValue();
17503 }
17504
17505 static SDValue performTruncateCombine(SDNode *N,
17506                                       SelectionDAG &DAG) {
17507   EVT VT = N->getValueType(0);
17508   SDValue N0 = N->getOperand(0);
17509   if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
17510       N0.getOpcode() == AArch64ISD::DUP) {
17511     SDValue Op = N0.getOperand(0);
17512     if (VT.getScalarType() == MVT::i32 &&
17513         N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
17514       Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
17515     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
17516   }
17517
17518   return SDValue();
17519 }
17520
17521 // Check whether a node is an extend or shift operand.
17522 static bool isExtendOrShiftOperand(SDValue N) {
17523   unsigned Opcode = N.getOpcode();
17524   if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_INREG ||
17525       Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND) {
17526     EVT SrcVT;
17527     if (Opcode == ISD::SIGN_EXTEND_INREG)
17528       SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
17529     else
17530       SrcVT = N.getOperand(0).getValueType();
17531
17532     return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
17533   } else if (Opcode == ISD::AND) {
17534     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
17535     if (!CSD)
17536       return false;
17537     uint64_t AndMask = CSD->getZExtValue();
17538     return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
17539   } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
17540     return isa<ConstantSDNode>(N.getOperand(1));
17541   }
17542
17543   return false;
17544 }
17545
17546 // (N - Y) + Z --> (Z - Y) + N
17547 // when N is an extend or shift operand
17548 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
17549                                          SelectionDAG &DAG) {
17550   auto IsOneUseExtend = [](SDValue N) {
17551     return N.hasOneUse() && isExtendOrShiftOperand(N);
17552   };
17553
17554   // DAGCombiner will revert the combination when Z is a constant, causing an
17555   // infinite loop, so don't enable the combination when Z is a constant.
17556   // If Z is a one-use extend or shift operand, we can't do the optimization
17557   // either, as it would also fall into an infinite loop.
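  // For example (illustrative): with the extend operand being (sext w1),
  //   (add (sub (sext w1), x), y)
  // is rewritten to (add (sub y, x), (sext w1)), which allows the add to fold
  // the extend as an extended-register operand.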
17558 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z)) 17559 return SDValue(); 17560 17561 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse()) 17562 return SDValue(); 17563 17564 SDValue Shift = SUB.getOperand(0); 17565 if (!IsOneUseExtend(Shift)) 17566 return SDValue(); 17567 17568 SDLoc DL(N); 17569 EVT VT = N->getValueType(0); 17570 17571 SDValue Y = SUB.getOperand(1); 17572 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y); 17573 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift); 17574 } 17575 17576 static SDValue performAddCombineForShiftedOperands(SDNode *N, 17577 SelectionDAG &DAG) { 17578 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not 17579 // commutative. 17580 if (N->getOpcode() != ISD::ADD) 17581 return SDValue(); 17582 17583 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with 17584 // shifted register is only available for i32 and i64. 17585 EVT VT = N->getValueType(0); 17586 if (VT != MVT::i32 && VT != MVT::i64) 17587 return SDValue(); 17588 17589 SDLoc DL(N); 17590 SDValue LHS = N->getOperand(0); 17591 SDValue RHS = N->getOperand(1); 17592 17593 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG)) 17594 return Val; 17595 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG)) 17596 return Val; 17597 17598 uint64_t LHSImm = 0, RHSImm = 0; 17599 // If both operand are shifted by imm and shift amount is not greater than 4 17600 // for one operand, swap LHS and RHS to put operand with smaller shift amount 17601 // on RHS. 17602 // 17603 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with 17604 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD 17605 // with LSL (shift > 4). For the rest of processors, this is no-op for 17606 // performance or correctness. 17607 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) && 17608 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 && 17609 RHSImm > 4 && LHS.hasOneUse()) 17610 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS); 17611 17612 return SDValue(); 17613 } 17614 17615 static SDValue performAddSubCombine(SDNode *N, 17616 TargetLowering::DAGCombinerInfo &DCI, 17617 SelectionDAG &DAG) { 17618 // Try to change sum of two reductions. 17619 if (SDValue Val = performAddUADDVCombine(N, DAG)) 17620 return Val; 17621 if (SDValue Val = performAddDotCombine(N, DAG)) 17622 return Val; 17623 if (SDValue Val = performAddCSelIntoCSinc(N, DAG)) 17624 return Val; 17625 if (SDValue Val = performNegCSelCombine(N, DAG)) 17626 return Val; 17627 if (SDValue Val = performVectorAddSubExtCombine(N, DAG)) 17628 return Val; 17629 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG)) 17630 return Val; 17631 17632 return performAddSubLongCombine(N, DCI, DAG); 17633 } 17634 17635 // Massage DAGs which we can use the high-half "long" operations on into 17636 // something isel will recognize better. E.g. 17637 // 17638 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 17639 // (aarch64_neon_umull (extract_high (v2i64 vec))) 17640 // (extract_high (v2i64 (dup128 scalar))))) 17641 // 17642 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 17643 TargetLowering::DAGCombinerInfo &DCI, 17644 SelectionDAG &DAG) { 17645 if (DCI.isBeforeLegalizeOps()) 17646 return SDValue(); 17647 17648 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1); 17649 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 
1 : 2); 17650 assert(LHS.getValueType().is64BitVector() && 17651 RHS.getValueType().is64BitVector() && 17652 "unexpected shape for long operation"); 17653 17654 // Either node could be a DUP, but it's not worth doing both of them (you'd 17655 // just as well use the non-high version) so look for a corresponding extract 17656 // operation on the other "wing". 17657 if (isEssentiallyExtractHighSubvector(LHS)) { 17658 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 17659 if (!RHS.getNode()) 17660 return SDValue(); 17661 } else if (isEssentiallyExtractHighSubvector(RHS)) { 17662 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 17663 if (!LHS.getNode()) 17664 return SDValue(); 17665 } 17666 17667 if (IID == Intrinsic::not_intrinsic) 17668 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS); 17669 17670 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 17671 N->getOperand(0), LHS, RHS); 17672 } 17673 17674 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 17675 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 17676 unsigned ElemBits = ElemTy.getSizeInBits(); 17677 17678 int64_t ShiftAmount; 17679 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 17680 APInt SplatValue, SplatUndef; 17681 unsigned SplatBitSize; 17682 bool HasAnyUndefs; 17683 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 17684 HasAnyUndefs, ElemBits) || 17685 SplatBitSize != ElemBits) 17686 return SDValue(); 17687 17688 ShiftAmount = SplatValue.getSExtValue(); 17689 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 17690 ShiftAmount = CVN->getSExtValue(); 17691 } else 17692 return SDValue(); 17693 17694 unsigned Opcode; 17695 bool IsRightShift; 17696 switch (IID) { 17697 default: 17698 llvm_unreachable("Unknown shift intrinsic"); 17699 case Intrinsic::aarch64_neon_sqshl: 17700 Opcode = AArch64ISD::SQSHL_I; 17701 IsRightShift = false; 17702 break; 17703 case Intrinsic::aarch64_neon_uqshl: 17704 Opcode = AArch64ISD::UQSHL_I; 17705 IsRightShift = false; 17706 break; 17707 case Intrinsic::aarch64_neon_srshl: 17708 Opcode = AArch64ISD::SRSHR_I; 17709 IsRightShift = true; 17710 break; 17711 case Intrinsic::aarch64_neon_urshl: 17712 Opcode = AArch64ISD::URSHR_I; 17713 IsRightShift = true; 17714 break; 17715 case Intrinsic::aarch64_neon_sqshlu: 17716 Opcode = AArch64ISD::SQSHLU_I; 17717 IsRightShift = false; 17718 break; 17719 case Intrinsic::aarch64_neon_sshl: 17720 case Intrinsic::aarch64_neon_ushl: 17721 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular 17722 // left shift for positive shift amounts. Below, we only replace the current 17723 // node with VSHL, if this condition is met. 17724 Opcode = AArch64ISD::VSHL; 17725 IsRightShift = false; 17726 break; 17727 } 17728 17729 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 17730 SDLoc dl(N); 17731 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 17732 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 17733 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 17734 SDLoc dl(N); 17735 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 17736 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 17737 } 17738 17739 return SDValue(); 17740 } 17741 17742 // The CRC32[BH] instructions ignore the high bits of their data operand. 
Since 17743 // the intrinsics must be legal and take an i32, this means there's almost 17744 // certainly going to be a zext in the DAG which we can eliminate. 17745 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 17746 SDValue AndN = N->getOperand(2); 17747 if (AndN.getOpcode() != ISD::AND) 17748 return SDValue(); 17749 17750 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 17751 if (!CMask || CMask->getZExtValue() != Mask) 17752 return SDValue(); 17753 17754 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 17755 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 17756 } 17757 17758 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 17759 SelectionDAG &DAG) { 17760 SDLoc dl(N); 17761 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 17762 DAG.getNode(Opc, dl, 17763 N->getOperand(1).getSimpleValueType(), 17764 N->getOperand(1)), 17765 DAG.getConstant(0, dl, MVT::i64)); 17766 } 17767 17768 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { 17769 SDLoc DL(N); 17770 SDValue Op1 = N->getOperand(1); 17771 SDValue Op2 = N->getOperand(2); 17772 EVT ScalarTy = Op2.getValueType(); 17773 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 17774 ScalarTy = MVT::i32; 17775 17776 // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base). 17777 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0)); 17778 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2); 17779 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step); 17780 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1); 17781 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base); 17782 } 17783 17784 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { 17785 SDLoc dl(N); 17786 SDValue Scalar = N->getOperand(3); 17787 EVT ScalarTy = Scalar.getValueType(); 17788 17789 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 17790 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 17791 17792 SDValue Passthru = N->getOperand(1); 17793 SDValue Pred = N->getOperand(2); 17794 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), 17795 Pred, Scalar, Passthru); 17796 } 17797 17798 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { 17799 SDLoc dl(N); 17800 LLVMContext &Ctx = *DAG.getContext(); 17801 EVT VT = N->getValueType(0); 17802 17803 assert(VT.isScalableVector() && "Expected a scalable vector."); 17804 17805 // Current lowering only supports the SVE-ACLE types. 17806 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) 17807 return SDValue(); 17808 17809 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; 17810 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8; 17811 EVT ByteVT = 17812 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize)); 17813 17814 // Convert everything to the domain of EXT (i.e bytes). 
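// Both vector operands are reinterpreted as byte vectors, and the lane index is
// rescaled from elements to bytes to match.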
17815 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); 17816 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2)); 17817 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3), 17818 DAG.getConstant(ElemSize, dl, MVT::i32)); 17819 17820 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2); 17821 return DAG.getNode(ISD::BITCAST, dl, VT, EXT); 17822 } 17823 17824 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 17825 TargetLowering::DAGCombinerInfo &DCI, 17826 SelectionDAG &DAG) { 17827 if (DCI.isBeforeLegalize()) 17828 return SDValue(); 17829 17830 SDValue Comparator = N->getOperand(3); 17831 if (Comparator.getOpcode() == AArch64ISD::DUP || 17832 Comparator.getOpcode() == ISD::SPLAT_VECTOR) { 17833 unsigned IID = getIntrinsicID(N); 17834 EVT VT = N->getValueType(0); 17835 EVT CmpVT = N->getOperand(2).getValueType(); 17836 SDValue Pred = N->getOperand(1); 17837 SDValue Imm; 17838 SDLoc DL(N); 17839 17840 switch (IID) { 17841 default: 17842 llvm_unreachable("Called with wrong intrinsic!"); 17843 break; 17844 17845 // Signed comparisons 17846 case Intrinsic::aarch64_sve_cmpeq_wide: 17847 case Intrinsic::aarch64_sve_cmpne_wide: 17848 case Intrinsic::aarch64_sve_cmpge_wide: 17849 case Intrinsic::aarch64_sve_cmpgt_wide: 17850 case Intrinsic::aarch64_sve_cmplt_wide: 17851 case Intrinsic::aarch64_sve_cmple_wide: { 17852 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 17853 int64_t ImmVal = CN->getSExtValue(); 17854 if (ImmVal >= -16 && ImmVal <= 15) 17855 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 17856 else 17857 return SDValue(); 17858 } 17859 break; 17860 } 17861 // Unsigned comparisons 17862 case Intrinsic::aarch64_sve_cmphs_wide: 17863 case Intrinsic::aarch64_sve_cmphi_wide: 17864 case Intrinsic::aarch64_sve_cmplo_wide: 17865 case Intrinsic::aarch64_sve_cmpls_wide: { 17866 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 17867 uint64_t ImmVal = CN->getZExtValue(); 17868 if (ImmVal <= 127) 17869 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 17870 else 17871 return SDValue(); 17872 } 17873 break; 17874 } 17875 } 17876 17877 if (!Imm) 17878 return SDValue(); 17879 17880 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); 17881 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, 17882 N->getOperand(2), Splat, DAG.getCondCode(CC)); 17883 } 17884 17885 return SDValue(); 17886 } 17887 17888 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, 17889 AArch64CC::CondCode Cond) { 17890 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17891 17892 SDLoc DL(Op); 17893 assert(Op.getValueType().isScalableVector() && 17894 TLI.isTypeLegal(Op.getValueType()) && 17895 "Expected legal scalable vector type!"); 17896 assert(Op.getValueType() == Pg.getValueType() && 17897 "Expected same type for PTEST operands"); 17898 17899 // Ensure target specific opcodes are using legal type. 17900 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); 17901 SDValue TVal = DAG.getConstant(1, DL, OutVT); 17902 SDValue FVal = DAG.getConstant(0, DL, OutVT); 17903 17904 // Ensure operands have type nxv16i1. 
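// For an any/none-active test where the operand is known to zero its inactive
// lanes, reinterpreting the governing predicate is sufficient; otherwise use a
// proper predicate cast so lanes beyond the original predicate width cannot
// affect the test.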
17905 if (Op.getValueType() != MVT::nxv16i1) { 17906 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) && 17907 isZeroingInactiveLanes(Op)) 17908 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg); 17909 else 17910 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG); 17911 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); 17912 } 17913 17914 // Set condition code (CC) flags. 17915 SDValue Test = DAG.getNode( 17916 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST, 17917 DL, MVT::Other, Pg, Op); 17918 17919 // Convert CC to integer based on requested condition. 17920 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 17921 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); 17922 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); 17923 return DAG.getZExtOrTrunc(Res, DL, VT); 17924 } 17925 17926 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, 17927 SelectionDAG &DAG) { 17928 SDLoc DL(N); 17929 17930 SDValue Pred = N->getOperand(1); 17931 SDValue VecToReduce = N->getOperand(2); 17932 17933 // NOTE: The integer reduction's result type is not always linked to the 17934 // operand's element type so we construct it from the intrinsic's result type. 17935 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0)); 17936 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); 17937 17938 // SVE reductions set the whole vector register with the first element 17939 // containing the reduction result, which we'll now extract. 17940 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 17941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 17942 Zero); 17943 } 17944 17945 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, 17946 SelectionDAG &DAG) { 17947 SDLoc DL(N); 17948 17949 SDValue Pred = N->getOperand(1); 17950 SDValue VecToReduce = N->getOperand(2); 17951 17952 EVT ReduceVT = VecToReduce.getValueType(); 17953 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); 17954 17955 // SVE reductions set the whole vector register with the first element 17956 // containing the reduction result, which we'll now extract. 17957 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 17958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 17959 Zero); 17960 } 17961 17962 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, 17963 SelectionDAG &DAG) { 17964 SDLoc DL(N); 17965 17966 SDValue Pred = N->getOperand(1); 17967 SDValue InitVal = N->getOperand(2); 17968 SDValue VecToReduce = N->getOperand(3); 17969 EVT ReduceVT = VecToReduce.getValueType(); 17970 17971 // Ordered reductions use the first lane of the result vector as the 17972 // reduction's initial value. 17973 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 17974 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, 17975 DAG.getUNDEF(ReduceVT), InitVal, Zero); 17976 17977 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); 17978 17979 // SVE reductions set the whole vector register with the first element 17980 // containing the reduction result, which we'll now extract. 
17981 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 17982 Zero); 17983 } 17984 17985 // If a merged operation has no inactive lanes we can relax it to a predicated 17986 // or unpredicated operation, which potentially allows better isel (perhaps 17987 // using immediate forms) or relaxing register reuse requirements. 17988 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, 17989 SelectionDAG &DAG, bool UnpredOp = false, 17990 bool SwapOperands = false) { 17991 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); 17992 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); 17993 SDValue Pg = N->getOperand(1); 17994 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2); 17995 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3); 17996 17997 // ISD way to specify an all active predicate. 17998 if (isAllActivePredicate(DAG, Pg)) { 17999 if (UnpredOp) 18000 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2); 18001 18002 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2); 18003 } 18004 18005 // FUTURE: SplatVector(true) 18006 return SDValue(); 18007 } 18008 18009 static SDValue performIntrinsicCombine(SDNode *N, 18010 TargetLowering::DAGCombinerInfo &DCI, 18011 const AArch64Subtarget *Subtarget) { 18012 SelectionDAG &DAG = DCI.DAG; 18013 unsigned IID = getIntrinsicID(N); 18014 switch (IID) { 18015 default: 18016 break; 18017 case Intrinsic::get_active_lane_mask: { 18018 SDValue Res = SDValue(); 18019 EVT VT = N->getValueType(0); 18020 if (VT.isFixedLengthVector()) { 18021 // We can use the SVE whilelo instruction to lower this intrinsic by 18022 // creating the appropriate sequence of scalable vector operations and 18023 // then extracting a fixed-width subvector from the scalable vector. 18024 18025 SDLoc DL(N); 18026 SDValue ID = 18027 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); 18028 18029 EVT WhileVT = EVT::getVectorVT( 18030 *DAG.getContext(), MVT::i1, 18031 ElementCount::getScalable(VT.getVectorNumElements())); 18032 18033 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32. 18034 EVT PromVT = getPromotedVTForPredicate(WhileVT); 18035 18036 // Get the fixed-width equivalent of PromVT for extraction. 
18037 EVT ExtVT = 18038 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), 18039 VT.getVectorElementCount()); 18040 18041 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, 18042 N->getOperand(1), N->getOperand(2)); 18043 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); 18044 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, 18045 DAG.getConstant(0, DL, MVT::i64)); 18046 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); 18047 } 18048 return Res; 18049 } 18050 case Intrinsic::aarch64_neon_vcvtfxs2fp: 18051 case Intrinsic::aarch64_neon_vcvtfxu2fp: 18052 return tryCombineFixedPointConvert(N, DCI, DAG); 18053 case Intrinsic::aarch64_neon_saddv: 18054 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 18055 case Intrinsic::aarch64_neon_uaddv: 18056 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 18057 case Intrinsic::aarch64_neon_sminv: 18058 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 18059 case Intrinsic::aarch64_neon_uminv: 18060 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 18061 case Intrinsic::aarch64_neon_smaxv: 18062 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 18063 case Intrinsic::aarch64_neon_umaxv: 18064 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 18065 case Intrinsic::aarch64_neon_fmax: 18066 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), 18067 N->getOperand(1), N->getOperand(2)); 18068 case Intrinsic::aarch64_neon_fmin: 18069 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), 18070 N->getOperand(1), N->getOperand(2)); 18071 case Intrinsic::aarch64_neon_fmaxnm: 18072 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 18073 N->getOperand(1), N->getOperand(2)); 18074 case Intrinsic::aarch64_neon_fminnm: 18075 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 18076 N->getOperand(1), N->getOperand(2)); 18077 case Intrinsic::aarch64_neon_smull: 18078 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), 18079 N->getOperand(1), N->getOperand(2)); 18080 case Intrinsic::aarch64_neon_umull: 18081 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), 18082 N->getOperand(1), N->getOperand(2)); 18083 case Intrinsic::aarch64_neon_pmull: 18084 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0), 18085 N->getOperand(1), N->getOperand(2)); 18086 case Intrinsic::aarch64_neon_sqdmull: 18087 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 18088 case Intrinsic::aarch64_neon_sqshl: 18089 case Intrinsic::aarch64_neon_uqshl: 18090 case Intrinsic::aarch64_neon_sqshlu: 18091 case Intrinsic::aarch64_neon_srshl: 18092 case Intrinsic::aarch64_neon_urshl: 18093 case Intrinsic::aarch64_neon_sshl: 18094 case Intrinsic::aarch64_neon_ushl: 18095 return tryCombineShiftImm(IID, N, DAG); 18096 case Intrinsic::aarch64_neon_rshrn: { 18097 EVT VT = N->getOperand(1).getValueType(); 18098 SDLoc DL(N); 18099 SDValue Imm = 18100 DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT); 18101 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm); 18102 SDValue Sht = 18103 DAG.getNode(ISD::SRL, DL, VT, Add, 18104 DAG.getConstant(N->getConstantOperandVal(2), DL, VT)); 18105 return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht); 18106 } 18107 case Intrinsic::aarch64_crc32b: 18108 case Intrinsic::aarch64_crc32cb: 18109 return tryCombineCRC32(0xff, N, DAG); 18110 case Intrinsic::aarch64_crc32h: 18111 case Intrinsic::aarch64_crc32ch: 18112 return 
tryCombineCRC32(0xffff, N, DAG); 18113 case Intrinsic::aarch64_sve_saddv: 18114 // There is no i64 version of SADDV because the sign is irrelevant. 18115 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) 18116 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 18117 else 18118 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); 18119 case Intrinsic::aarch64_sve_uaddv: 18120 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 18121 case Intrinsic::aarch64_sve_smaxv: 18122 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); 18123 case Intrinsic::aarch64_sve_umaxv: 18124 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); 18125 case Intrinsic::aarch64_sve_sminv: 18126 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); 18127 case Intrinsic::aarch64_sve_uminv: 18128 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); 18129 case Intrinsic::aarch64_sve_orv: 18130 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); 18131 case Intrinsic::aarch64_sve_eorv: 18132 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); 18133 case Intrinsic::aarch64_sve_andv: 18134 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); 18135 case Intrinsic::aarch64_sve_index: 18136 return LowerSVEIntrinsicIndex(N, DAG); 18137 case Intrinsic::aarch64_sve_dup: 18138 return LowerSVEIntrinsicDUP(N, DAG); 18139 case Intrinsic::aarch64_sve_dup_x: 18140 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), 18141 N->getOperand(1)); 18142 case Intrinsic::aarch64_sve_ext: 18143 return LowerSVEIntrinsicEXT(N, DAG); 18144 case Intrinsic::aarch64_sve_mul: 18145 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG); 18146 case Intrinsic::aarch64_sve_mul_u: 18147 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0), 18148 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18149 case Intrinsic::aarch64_sve_smulh: 18150 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG); 18151 case Intrinsic::aarch64_sve_smulh_u: 18152 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0), 18153 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18154 case Intrinsic::aarch64_sve_umulh: 18155 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG); 18156 case Intrinsic::aarch64_sve_umulh_u: 18157 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0), 18158 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18159 case Intrinsic::aarch64_sve_smin: 18160 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); 18161 case Intrinsic::aarch64_sve_smin_u: 18162 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0), 18163 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18164 case Intrinsic::aarch64_sve_umin: 18165 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG); 18166 case Intrinsic::aarch64_sve_umin_u: 18167 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0), 18168 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18169 case Intrinsic::aarch64_sve_smax: 18170 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG); 18171 case Intrinsic::aarch64_sve_smax_u: 18172 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0), 18173 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18174 case Intrinsic::aarch64_sve_umax: 18175 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG); 18176 case Intrinsic::aarch64_sve_umax_u: 18177 return 
DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0), 18178 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18179 case Intrinsic::aarch64_sve_lsl: 18180 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG); 18181 case Intrinsic::aarch64_sve_lsl_u: 18182 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0), 18183 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18184 case Intrinsic::aarch64_sve_lsr: 18185 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); 18186 case Intrinsic::aarch64_sve_lsr_u: 18187 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0), 18188 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18189 case Intrinsic::aarch64_sve_asr: 18190 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); 18191 case Intrinsic::aarch64_sve_asr_u: 18192 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0), 18193 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18194 case Intrinsic::aarch64_sve_fadd: 18195 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG); 18196 case Intrinsic::aarch64_sve_fsub: 18197 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG); 18198 case Intrinsic::aarch64_sve_fmul: 18199 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG); 18200 case Intrinsic::aarch64_sve_add: 18201 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true); 18202 case Intrinsic::aarch64_sve_add_u: 18203 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2), 18204 N->getOperand(3)); 18205 case Intrinsic::aarch64_sve_sub: 18206 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true); 18207 case Intrinsic::aarch64_sve_sub_u: 18208 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2), 18209 N->getOperand(3)); 18210 case Intrinsic::aarch64_sve_subr: 18211 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true); 18212 case Intrinsic::aarch64_sve_and: 18213 return convertMergedOpToPredOp(N, ISD::AND, DAG, true); 18214 case Intrinsic::aarch64_sve_bic: 18215 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true); 18216 case Intrinsic::aarch64_sve_eor: 18217 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true); 18218 case Intrinsic::aarch64_sve_orr: 18219 return convertMergedOpToPredOp(N, ISD::OR, DAG, true); 18220 case Intrinsic::aarch64_sve_sabd: 18221 return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true); 18222 case Intrinsic::aarch64_sve_sabd_u: 18223 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0), 18224 N->getOperand(2), N->getOperand(3)); 18225 case Intrinsic::aarch64_sve_uabd: 18226 return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true); 18227 case Intrinsic::aarch64_sve_uabd_u: 18228 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), 18229 N->getOperand(2), N->getOperand(3)); 18230 case Intrinsic::aarch64_sve_sdiv_u: 18231 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0), 18232 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18233 case Intrinsic::aarch64_sve_udiv_u: 18234 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0), 18235 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18236 case Intrinsic::aarch64_sve_sqadd: 18237 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); 18238 case Intrinsic::aarch64_sve_sqsub: 18239 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true); 18240 case Intrinsic::aarch64_sve_uqadd: 18241 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true); 18242 case 
Intrinsic::aarch64_sve_uqsub: 18243 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true); 18244 case Intrinsic::aarch64_sve_sqadd_x: 18245 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0), 18246 N->getOperand(1), N->getOperand(2)); 18247 case Intrinsic::aarch64_sve_sqsub_x: 18248 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), 18249 N->getOperand(1), N->getOperand(2)); 18250 case Intrinsic::aarch64_sve_uqadd_x: 18251 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0), 18252 N->getOperand(1), N->getOperand(2)); 18253 case Intrinsic::aarch64_sve_uqsub_x: 18254 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), 18255 N->getOperand(1), N->getOperand(2)); 18256 case Intrinsic::aarch64_sve_asrd: 18257 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0), 18258 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18259 case Intrinsic::aarch64_sve_cmphs: 18260 if (!N->getOperand(2).getValueType().isFloatingPoint()) 18261 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18262 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18263 N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); 18264 break; 18265 case Intrinsic::aarch64_sve_cmphi: 18266 if (!N->getOperand(2).getValueType().isFloatingPoint()) 18267 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18268 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18269 N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); 18270 break; 18271 case Intrinsic::aarch64_sve_fcmpge: 18272 case Intrinsic::aarch64_sve_cmpge: 18273 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18274 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18275 N->getOperand(3), DAG.getCondCode(ISD::SETGE)); 18276 break; 18277 case Intrinsic::aarch64_sve_fcmpgt: 18278 case Intrinsic::aarch64_sve_cmpgt: 18279 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18280 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18281 N->getOperand(3), DAG.getCondCode(ISD::SETGT)); 18282 break; 18283 case Intrinsic::aarch64_sve_fcmpeq: 18284 case Intrinsic::aarch64_sve_cmpeq: 18285 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18286 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18287 N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); 18288 break; 18289 case Intrinsic::aarch64_sve_fcmpne: 18290 case Intrinsic::aarch64_sve_cmpne: 18291 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18292 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18293 N->getOperand(3), DAG.getCondCode(ISD::SETNE)); 18294 break; 18295 case Intrinsic::aarch64_sve_fcmpuo: 18296 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 18297 N->getValueType(0), N->getOperand(1), N->getOperand(2), 18298 N->getOperand(3), DAG.getCondCode(ISD::SETUO)); 18299 break; 18300 case Intrinsic::aarch64_sve_fadda: 18301 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); 18302 case Intrinsic::aarch64_sve_faddv: 18303 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); 18304 case Intrinsic::aarch64_sve_fmaxnmv: 18305 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); 18306 case Intrinsic::aarch64_sve_fmaxv: 18307 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); 18308 case Intrinsic::aarch64_sve_fminnmv: 18309 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); 18310 case Intrinsic::aarch64_sve_fminv: 18311 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); 18312 case Intrinsic::aarch64_sve_sel: 18313 
return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), 18314 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 18315 case Intrinsic::aarch64_sve_cmpeq_wide: 18316 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); 18317 case Intrinsic::aarch64_sve_cmpne_wide: 18318 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); 18319 case Intrinsic::aarch64_sve_cmpge_wide: 18320 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); 18321 case Intrinsic::aarch64_sve_cmpgt_wide: 18322 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); 18323 case Intrinsic::aarch64_sve_cmplt_wide: 18324 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); 18325 case Intrinsic::aarch64_sve_cmple_wide: 18326 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); 18327 case Intrinsic::aarch64_sve_cmphs_wide: 18328 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); 18329 case Intrinsic::aarch64_sve_cmphi_wide: 18330 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); 18331 case Intrinsic::aarch64_sve_cmplo_wide: 18332 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); 18333 case Intrinsic::aarch64_sve_cmpls_wide: 18334 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); 18335 case Intrinsic::aarch64_sve_ptest_any: 18336 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 18337 AArch64CC::ANY_ACTIVE); 18338 case Intrinsic::aarch64_sve_ptest_first: 18339 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 18340 AArch64CC::FIRST_ACTIVE); 18341 case Intrinsic::aarch64_sve_ptest_last: 18342 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 18343 AArch64CC::LAST_ACTIVE); 18344 } 18345 return SDValue(); 18346 } 18347 18348 static bool isCheapToExtend(const SDValue &N) { 18349 unsigned OC = N->getOpcode(); 18350 return OC == ISD::LOAD || OC == ISD::MLOAD || 18351 ISD::isConstantSplatVectorAllZeros(N.getNode()); 18352 } 18353 18354 static SDValue 18355 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 18356 SelectionDAG &DAG) { 18357 // If we have (sext (setcc A B)) and A and B are cheap to extend, 18358 // we can move the sext into the arguments and have the same result. For 18359 // example, if A and B are both loads, we can make those extending loads and 18360 // avoid an extra instruction. This pattern appears often in VLS code 18361 // generation where the inputs to the setcc have a different size to the 18362 // instruction that wants to use the result of the setcc. 18363 assert(N->getOpcode() == ISD::SIGN_EXTEND && 18364 N->getOperand(0)->getOpcode() == ISD::SETCC); 18365 const SDValue SetCC = N->getOperand(0); 18366 18367 const SDValue CCOp0 = SetCC.getOperand(0); 18368 const SDValue CCOp1 = SetCC.getOperand(1); 18369 if (!CCOp0->getValueType(0).isInteger() || 18370 !CCOp1->getValueType(0).isInteger()) 18371 return SDValue(); 18372 18373 ISD::CondCode Code = 18374 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get(); 18375 18376 ISD::NodeType ExtType = 18377 isSignedIntSetCC(Code) ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 18378 18379 if (isCheapToExtend(SetCC.getOperand(0)) && 18380 isCheapToExtend(SetCC.getOperand(1))) { 18381 const SDValue Ext1 = 18382 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0); 18383 const SDValue Ext2 = 18384 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1); 18385 18386 return DAG.getSetCC( 18387 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2, 18388 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get()); 18389 } 18390 18391 return SDValue(); 18392 } 18393 18394 static SDValue performExtendCombine(SDNode *N, 18395 TargetLowering::DAGCombinerInfo &DCI, 18396 SelectionDAG &DAG) { 18397 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 18398 // we can convert that DUP into another extract_high (of a bigger DUP), which 18399 // helps the backend to decide that an sabdl2 would be useful, saving a real 18400 // extract_high operation. 18401 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 18402 (N->getOperand(0).getOpcode() == ISD::ABDU || 18403 N->getOperand(0).getOpcode() == ISD::ABDS)) { 18404 SDNode *ABDNode = N->getOperand(0).getNode(); 18405 SDValue NewABD = 18406 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); 18407 if (!NewABD.getNode()) 18408 return SDValue(); 18409 18410 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); 18411 } 18412 18413 if (N->getValueType(0).isFixedLengthVector() && 18414 N->getOpcode() == ISD::SIGN_EXTEND && 18415 N->getOperand(0)->getOpcode() == ISD::SETCC) 18416 return performSignExtendSetCCCombine(N, DCI, DAG); 18417 18418 return SDValue(); 18419 } 18420 18421 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, 18422 SDValue SplatVal, unsigned NumVecElts) { 18423 assert(!St.isTruncatingStore() && "cannot split truncating vector store"); 18424 Align OrigAlignment = St.getAlign(); 18425 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; 18426 18427 // Create scalar stores. This is at least as good as the code sequence for a 18428 // split unaligned store which is a dup.s, ext.b, and two stores. 18429 // Most of the time the three stores should be replaced by store pair 18430 // instructions (stp). 18431 SDLoc DL(&St); 18432 SDValue BasePtr = St.getBasePtr(); 18433 uint64_t BaseOffset = 0; 18434 18435 const MachinePointerInfo &PtrInfo = St.getPointerInfo(); 18436 SDValue NewST1 = 18437 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, 18438 OrigAlignment, St.getMemOperand()->getFlags()); 18439 18440 // As this in ISel, we will not merge this add which may degrade results. 18441 if (BasePtr->getOpcode() == ISD::ADD && 18442 isa<ConstantSDNode>(BasePtr->getOperand(1))) { 18443 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); 18444 BasePtr = BasePtr->getOperand(0); 18445 } 18446 18447 unsigned Offset = EltOffset; 18448 while (--NumVecElts) { 18449 Align Alignment = commonAlignment(OrigAlignment, Offset); 18450 SDValue OffsetPtr = 18451 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 18452 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); 18453 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 18454 PtrInfo.getWithOffset(Offset), Alignment, 18455 St.getMemOperand()->getFlags()); 18456 Offset += EltOffset; 18457 } 18458 return NewST1; 18459 } 18460 18461 // Returns an SVE type that ContentTy can be trivially sign or zero extended 18462 // into. 
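// For example, nxv2i16 widens to nxv2i64, nxv4i8 to nxv4i32 and nxv8i8 to
// nxv8i16, while nxv16i8 is already a full container.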
18463 static MVT getSVEContainerType(EVT ContentTy) { 18464 assert(ContentTy.isSimple() && "No SVE containers for extended types"); 18465 18466 switch (ContentTy.getSimpleVT().SimpleTy) { 18467 default: 18468 llvm_unreachable("No known SVE container for this MVT type"); 18469 case MVT::nxv2i8: 18470 case MVT::nxv2i16: 18471 case MVT::nxv2i32: 18472 case MVT::nxv2i64: 18473 case MVT::nxv2f32: 18474 case MVT::nxv2f64: 18475 return MVT::nxv2i64; 18476 case MVT::nxv4i8: 18477 case MVT::nxv4i16: 18478 case MVT::nxv4i32: 18479 case MVT::nxv4f32: 18480 return MVT::nxv4i32; 18481 case MVT::nxv8i8: 18482 case MVT::nxv8i16: 18483 case MVT::nxv8f16: 18484 case MVT::nxv8bf16: 18485 return MVT::nxv8i16; 18486 case MVT::nxv16i8: 18487 return MVT::nxv16i8; 18488 } 18489 } 18490 18491 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { 18492 SDLoc DL(N); 18493 EVT VT = N->getValueType(0); 18494 18495 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) 18496 return SDValue(); 18497 18498 EVT ContainerVT = VT; 18499 if (ContainerVT.isInteger()) 18500 ContainerVT = getSVEContainerType(ContainerVT); 18501 18502 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); 18503 SDValue Ops[] = { N->getOperand(0), // Chain 18504 N->getOperand(2), // Pg 18505 N->getOperand(3), // Base 18506 DAG.getValueType(VT) }; 18507 18508 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); 18509 SDValue LoadChain = SDValue(Load.getNode(), 1); 18510 18511 if (ContainerVT.isInteger() && (VT != ContainerVT)) 18512 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); 18513 18514 return DAG.getMergeValues({ Load, LoadChain }, DL); 18515 } 18516 18517 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { 18518 SDLoc DL(N); 18519 EVT VT = N->getValueType(0); 18520 EVT PtrTy = N->getOperand(3).getValueType(); 18521 18522 EVT LoadVT = VT; 18523 if (VT.isFloatingPoint()) 18524 LoadVT = VT.changeTypeToInteger(); 18525 18526 auto *MINode = cast<MemIntrinsicSDNode>(N); 18527 SDValue PassThru = DAG.getConstant(0, DL, LoadVT); 18528 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), 18529 MINode->getOperand(3), DAG.getUNDEF(PtrTy), 18530 MINode->getOperand(2), PassThru, 18531 MINode->getMemoryVT(), MINode->getMemOperand(), 18532 ISD::UNINDEXED, ISD::NON_EXTLOAD, false); 18533 18534 if (VT.isFloatingPoint()) { 18535 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) }; 18536 return DAG.getMergeValues(Ops, DL); 18537 } 18538 18539 return L; 18540 } 18541 18542 template <unsigned Opcode> 18543 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { 18544 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || 18545 Opcode == AArch64ISD::LD1RO_MERGE_ZERO, 18546 "Unsupported opcode."); 18547 SDLoc DL(N); 18548 EVT VT = N->getValueType(0); 18549 18550 EVT LoadVT = VT; 18551 if (VT.isFloatingPoint()) 18552 LoadVT = VT.changeTypeToInteger(); 18553 18554 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; 18555 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); 18556 SDValue LoadChain = SDValue(Load.getNode(), 1); 18557 18558 if (VT.isFloatingPoint()) 18559 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); 18560 18561 return DAG.getMergeValues({Load, LoadChain}, DL); 18562 } 18563 18564 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { 18565 SDLoc DL(N); 18566 SDValue Data = N->getOperand(2); 18567 EVT DataVT = Data.getValueType(); 18568 EVT HwSrcVt = getSVEContainerType(DataVT); 18569 
SDValue InputVT = DAG.getValueType(DataVT); 18570 18571 if (DataVT.isFloatingPoint()) 18572 InputVT = DAG.getValueType(HwSrcVt); 18573 18574 SDValue SrcNew; 18575 if (Data.getValueType().isFloatingPoint()) 18576 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); 18577 else 18578 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); 18579 18580 SDValue Ops[] = { N->getOperand(0), // Chain 18581 SrcNew, 18582 N->getOperand(4), // Base 18583 N->getOperand(3), // Pg 18584 InputVT 18585 }; 18586 18587 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); 18588 } 18589 18590 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { 18591 SDLoc DL(N); 18592 18593 SDValue Data = N->getOperand(2); 18594 EVT DataVT = Data.getValueType(); 18595 EVT PtrTy = N->getOperand(4).getValueType(); 18596 18597 if (DataVT.isFloatingPoint()) 18598 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); 18599 18600 auto *MINode = cast<MemIntrinsicSDNode>(N); 18601 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4), 18602 DAG.getUNDEF(PtrTy), MINode->getOperand(3), 18603 MINode->getMemoryVT(), MINode->getMemOperand(), 18604 ISD::UNINDEXED, false, false); 18605 } 18606 18607 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The 18608 /// load store optimizer pass will merge them to store pair stores. This should 18609 /// be better than a movi to create the vector zero followed by a vector store 18610 /// if the zero constant is not re-used, since one instructions and one register 18611 /// live range will be removed. 18612 /// 18613 /// For example, the final generated code should be: 18614 /// 18615 /// stp xzr, xzr, [x0] 18616 /// 18617 /// instead of: 18618 /// 18619 /// movi v0.2d, #0 18620 /// str q0, [x0] 18621 /// 18622 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 18623 SDValue StVal = St.getValue(); 18624 EVT VT = StVal.getValueType(); 18625 18626 // Avoid scalarizing zero splat stores for scalable vectors. 18627 if (VT.isScalableVector()) 18628 return SDValue(); 18629 18630 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or 18631 // 2, 3 or 4 i32 elements. 18632 int NumVecElts = VT.getVectorNumElements(); 18633 if (!(((NumVecElts == 2 || NumVecElts == 3) && 18634 VT.getVectorElementType().getSizeInBits() == 64) || 18635 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && 18636 VT.getVectorElementType().getSizeInBits() == 32))) 18637 return SDValue(); 18638 18639 if (StVal.getOpcode() != ISD::BUILD_VECTOR) 18640 return SDValue(); 18641 18642 // If the zero constant has more than one use then the vector store could be 18643 // better since the constant mov will be amortized and stp q instructions 18644 // should be able to be formed. 18645 if (!StVal.hasOneUse()) 18646 return SDValue(); 18647 18648 // If the store is truncating then it's going down to i16 or smaller, which 18649 // means it can be implemented in a single store anyway. 18650 if (St.isTruncatingStore()) 18651 return SDValue(); 18652 18653 // If the immediate offset of the address operand is too large for the stp 18654 // instruction, then bail out. 
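// (The 64-bit STP form takes a signed 7-bit immediate scaled by 8, i.e. byte
// offsets in the [-512, 504] range checked below.)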
18655 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { 18656 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); 18657 if (Offset < -512 || Offset > 504) 18658 return SDValue(); 18659 } 18660 18661 for (int I = 0; I < NumVecElts; ++I) { 18662 SDValue EltVal = StVal.getOperand(I); 18663 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) 18664 return SDValue(); 18665 } 18666 18667 // Use a CopyFromReg WZR/XZR here to prevent 18668 // DAGCombiner::MergeConsecutiveStores from undoing this transformation. 18669 SDLoc DL(&St); 18670 unsigned ZeroReg; 18671 EVT ZeroVT; 18672 if (VT.getVectorElementType().getSizeInBits() == 32) { 18673 ZeroReg = AArch64::WZR; 18674 ZeroVT = MVT::i32; 18675 } else { 18676 ZeroReg = AArch64::XZR; 18677 ZeroVT = MVT::i64; 18678 } 18679 SDValue SplatVal = 18680 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); 18681 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 18682 } 18683 18684 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 18685 /// value. The load store optimizer pass will merge them to store pair stores. 18686 /// This has better performance than a splat of the scalar followed by a split 18687 /// vector store. Even if the stores are not merged it is four stores vs a dup, 18688 /// followed by an ext.b and two stores. 18689 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 18690 SDValue StVal = St.getValue(); 18691 EVT VT = StVal.getValueType(); 18692 18693 // Don't replace floating point stores, they possibly won't be transformed to 18694 // stp because of the store pair suppress pass. 18695 if (VT.isFloatingPoint()) 18696 return SDValue(); 18697 18698 // We can express a splat as store pair(s) for 2 or 4 elements. 18699 unsigned NumVecElts = VT.getVectorNumElements(); 18700 if (NumVecElts != 4 && NumVecElts != 2) 18701 return SDValue(); 18702 18703 // If the store is truncating then it's going down to i16 or smaller, which 18704 // means it can be implemented in a single store anyway. 18705 if (St.isTruncatingStore()) 18706 return SDValue(); 18707 18708 // Check that this is a splat. 18709 // Make sure that each of the relevant vector element locations are inserted 18710 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. 18711 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); 18712 SDValue SplatVal; 18713 for (unsigned I = 0; I < NumVecElts; ++I) { 18714 // Check for insert vector elements. 18715 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 18716 return SDValue(); 18717 18718 // Check that same value is inserted at each vector element. 18719 if (I == 0) 18720 SplatVal = StVal.getOperand(1); 18721 else if (StVal.getOperand(1) != SplatVal) 18722 return SDValue(); 18723 18724 // Check insert element index. 18725 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2)); 18726 if (!CIndex) 18727 return SDValue(); 18728 uint64_t IndexVal = CIndex->getZExtValue(); 18729 if (IndexVal >= NumVecElts) 18730 return SDValue(); 18731 IndexNotInserted.reset(IndexVal); 18732 18733 StVal = StVal.getOperand(0); 18734 } 18735 // Check that all vector element locations were inserted to. 
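// Any index still set at this point was never written, so the value is not a
// full splat.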
18736 if (IndexNotInserted.any()) 18737 return SDValue(); 18738 18739 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 18740 } 18741 18742 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 18743 SelectionDAG &DAG, 18744 const AArch64Subtarget *Subtarget) { 18745 18746 StoreSDNode *S = cast<StoreSDNode>(N); 18747 if (S->isVolatile() || S->isIndexed()) 18748 return SDValue(); 18749 18750 SDValue StVal = S->getValue(); 18751 EVT VT = StVal.getValueType(); 18752 18753 if (!VT.isFixedLengthVector()) 18754 return SDValue(); 18755 18756 // If we get a splat of zeros, convert this vector store to a store of 18757 // scalars. They will be merged into store pairs of xzr thereby removing one 18758 // instruction and one register. 18759 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) 18760 return ReplacedZeroSplat; 18761 18762 // FIXME: The logic for deciding if an unaligned store should be split should 18763 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 18764 // a call to that function here. 18765 18766 if (!Subtarget->isMisaligned128StoreSlow()) 18767 return SDValue(); 18768 18769 // Don't split at -Oz. 18770 if (DAG.getMachineFunction().getFunction().hasMinSize()) 18771 return SDValue(); 18772 18773 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 18774 // those up regresses performance on micro-benchmarks and olden/bh. 18775 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 18776 return SDValue(); 18777 18778 // Split unaligned 16B stores. They are terrible for performance. 18779 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 18780 // extensions can use this to mark that it does not want splitting to happen 18781 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 18782 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 18783 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) || 18784 S->getAlign() <= Align(2)) 18785 return SDValue(); 18786 18787 // If we get a splat of a scalar convert this vector store to a store of 18788 // scalars. They will be merged into store pairs thereby removing two 18789 // instructions. 18790 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) 18791 return ReplacedSplat; 18792 18793 SDLoc DL(S); 18794 18795 // Split VT into two. 
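// The store is known to be 128 bits wide at this point, so each half is 8 bytes
// and the second half is stored at BasePtr + 8.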
18796 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 18797 unsigned NumElts = HalfVT.getVectorNumElements(); 18798 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 18799 DAG.getConstant(0, DL, MVT::i64)); 18800 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 18801 DAG.getConstant(NumElts, DL, MVT::i64)); 18802 SDValue BasePtr = S->getBasePtr(); 18803 SDValue NewST1 = 18804 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 18805 S->getAlign(), S->getMemOperand()->getFlags()); 18806 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 18807 DAG.getConstant(8, DL, MVT::i64)); 18808 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 18809 S->getPointerInfo(), S->getAlign(), 18810 S->getMemOperand()->getFlags()); 18811 } 18812 18813 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) { 18814 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!"); 18815 18816 // splice(pg, op1, undef) -> op1 18817 if (N->getOperand(2).isUndef()) 18818 return N->getOperand(1); 18819 18820 return SDValue(); 18821 } 18822 18823 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, 18824 const AArch64Subtarget *Subtarget) { 18825 assert((N->getOpcode() == AArch64ISD::UUNPKHI || 18826 N->getOpcode() == AArch64ISD::UUNPKLO) && 18827 "Unexpected Opcode!"); 18828 18829 // uunpklo/hi undef -> undef 18830 if (N->getOperand(0).isUndef()) 18831 return DAG.getUNDEF(N->getValueType(0)); 18832 18833 // If this is a masked load followed by an UUNPKLO, fold this into a masked 18834 // extending load. We can do this even if this is already a masked 18835 // {z,}extload. 18836 if (N->getOperand(0).getOpcode() == ISD::MLOAD && 18837 N->getOpcode() == AArch64ISD::UUNPKLO) { 18838 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0)); 18839 SDValue Mask = MLD->getMask(); 18840 SDLoc DL(N); 18841 18842 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD && 18843 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE && 18844 (MLD->getPassThru()->isUndef() || 18845 isZerosVector(MLD->getPassThru().getNode()))) { 18846 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); 18847 unsigned PgPattern = Mask->getConstantOperandVal(0); 18848 EVT VT = N->getValueType(0); 18849 18850 // Ensure we can double the size of the predicate pattern 18851 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); 18852 if (NumElts && 18853 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) { 18854 Mask = 18855 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern); 18856 SDValue PassThru = DAG.getConstant(0, DL, VT); 18857 SDValue NewLoad = DAG.getMaskedLoad( 18858 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask, 18859 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(), 18860 MLD->getAddressingMode(), ISD::ZEXTLOAD); 18861 18862 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1)); 18863 18864 return NewLoad; 18865 } 18866 } 18867 } 18868 18869 return SDValue(); 18870 } 18871 18872 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { 18873 SDLoc DL(N); 18874 SDValue Op0 = N->getOperand(0); 18875 SDValue Op1 = N->getOperand(1); 18876 EVT ResVT = N->getValueType(0); 18877 18878 // uzp1(x, undef) -> concat(truncate(x), undef) 18879 if (Op1.getOpcode() == ISD::UNDEF) { 18880 EVT BCVT = MVT::Other, HalfVT = MVT::Other; 18881 switch (ResVT.getSimpleVT().SimpleTy) { 18882 
default: 18883 break; 18884 case MVT::v16i8: 18885 BCVT = MVT::v8i16; 18886 HalfVT = MVT::v8i8; 18887 break; 18888 case MVT::v8i16: 18889 BCVT = MVT::v4i32; 18890 HalfVT = MVT::v4i16; 18891 break; 18892 case MVT::v4i32: 18893 BCVT = MVT::v2i64; 18894 HalfVT = MVT::v2i32; 18895 break; 18896 } 18897 if (BCVT != MVT::Other) { 18898 SDValue BC = DAG.getBitcast(BCVT, Op0); 18899 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); 18900 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc, 18901 DAG.getUNDEF(HalfVT)); 18902 } 18903 } 18904 18905 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) 18906 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { 18907 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { 18908 SDValue X = Op0.getOperand(0).getOperand(0); 18909 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); 18910 } 18911 } 18912 18913 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) 18914 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { 18915 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { 18916 SDValue Z = Op1.getOperand(0).getOperand(1); 18917 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); 18918 } 18919 } 18920 18921 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y)) 18922 // Only implemented on little-endian subtargets. 18923 bool IsLittleEndian = DAG.getDataLayout().isLittleEndian(); 18924 18925 // This optimization only works on little endian. 18926 if (!IsLittleEndian) 18927 return SDValue(); 18928 18929 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8) 18930 return SDValue(); 18931 18932 auto getSourceOp = [](SDValue Operand) -> SDValue { 18933 const unsigned Opcode = Operand.getOpcode(); 18934 if (Opcode == ISD::TRUNCATE) 18935 return Operand->getOperand(0); 18936 if (Opcode == ISD::BITCAST && 18937 Operand->getOperand(0).getOpcode() == ISD::TRUNCATE) 18938 return Operand->getOperand(0)->getOperand(0); 18939 return SDValue(); 18940 }; 18941 18942 SDValue SourceOp0 = getSourceOp(Op0); 18943 SDValue SourceOp1 = getSourceOp(Op1); 18944 18945 if (!SourceOp0 || !SourceOp1) 18946 return SDValue(); 18947 18948 if (SourceOp0.getValueType() != SourceOp1.getValueType() || 18949 !SourceOp0.getValueType().isSimple()) 18950 return SDValue(); 18951 18952 EVT ResultTy; 18953 18954 switch (SourceOp0.getSimpleValueType().SimpleTy) { 18955 case MVT::v2i64: 18956 ResultTy = MVT::v4i32; 18957 break; 18958 case MVT::v4i32: 18959 ResultTy = MVT::v8i16; 18960 break; 18961 case MVT::v8i16: 18962 ResultTy = MVT::v16i8; 18963 break; 18964 default: 18965 return SDValue(); 18966 } 18967 18968 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0); 18969 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1); 18970 SDValue UzpResult = 18971 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1); 18972 18973 EVT BitcastResultTy; 18974 18975 switch (ResVT.getSimpleVT().SimpleTy) { 18976 case MVT::v2i32: 18977 BitcastResultTy = MVT::v2i64; 18978 break; 18979 case MVT::v4i16: 18980 BitcastResultTy = MVT::v4i32; 18981 break; 18982 case MVT::v8i8: 18983 BitcastResultTy = MVT::v8i16; 18984 break; 18985 default: 18986 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}"); 18987 } 18988 18989 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, 18990 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult)); 18991 } 18992 18993 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) { 18994 unsigned Opc = N->getOpcode(); 18995 18996 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads 18997 Opc <= 
AArch64ISD::GLD1_IMM_MERGE_ZERO) || 18998 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads 18999 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) && 19000 "Invalid opcode."); 19001 19002 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO || 19003 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 19004 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO || 19005 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 19006 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO || 19007 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO || 19008 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO || 19009 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO; 19010 19011 SDLoc DL(N); 19012 SDValue Chain = N->getOperand(0); 19013 SDValue Pg = N->getOperand(1); 19014 SDValue Base = N->getOperand(2); 19015 SDValue Offset = N->getOperand(3); 19016 SDValue Ty = N->getOperand(4); 19017 19018 EVT ResVT = N->getValueType(0); 19019 19020 const auto OffsetOpc = Offset.getOpcode(); 19021 const bool OffsetIsZExt = 19022 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU; 19023 const bool OffsetIsSExt = 19024 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU; 19025 19026 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible. 19027 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) { 19028 SDValue ExtPg = Offset.getOperand(0); 19029 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode()); 19030 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType(); 19031 19032 // If the predicate for the sign- or zero-extended offset is the 19033 // same as the predicate used for this load and the sign-/zero-extension 19034 // was from a 32-bits... 19035 if (ExtPg == Pg && ExtFromEVT == MVT::i32) { 19036 SDValue UnextendedOffset = Offset.getOperand(1); 19037 19038 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true); 19039 if (Signed) 19040 NewOpc = getSignExtendedGatherOpcode(NewOpc); 19041 19042 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other}, 19043 {Chain, Pg, Base, UnextendedOffset, Ty}); 19044 } 19045 } 19046 19047 return SDValue(); 19048 } 19049 19050 /// Optimize a vector shift instruction and its operand if shifted out 19051 /// bits are not used. 
19052 static SDValue performVectorShiftCombine(SDNode *N, 19053 const AArch64TargetLowering &TLI, 19054 TargetLowering::DAGCombinerInfo &DCI) { 19055 assert(N->getOpcode() == AArch64ISD::VASHR || 19056 N->getOpcode() == AArch64ISD::VLSHR); 19057 19058 SDValue Op = N->getOperand(0); 19059 unsigned OpScalarSize = Op.getScalarValueSizeInBits(); 19060 19061 unsigned ShiftImm = N->getConstantOperandVal(1); 19062 assert(OpScalarSize > ShiftImm && "Invalid shift imm"); 19063 19064 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); 19065 APInt DemandedMask = ~ShiftedOutBits; 19066 19067 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 19068 return SDValue(N, 0); 19069 19070 return SDValue(); 19071 } 19072 19073 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { 19074 // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) 19075 // This transform works in partnership with performSetCCPunpkCombine to 19076 // remove unnecessary transfer of predicates into standard registers and back 19077 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND && 19078 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() == 19079 MVT::i1) { 19080 SDValue CC = N->getOperand(0)->getOperand(0); 19081 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext()); 19082 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC, 19083 DAG.getVectorIdxConstant(0, SDLoc(N))); 19084 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk); 19085 } 19086 19087 return SDValue(); 19088 } 19089 19090 /// Target-specific DAG combine function for post-increment LD1 (lane) and 19091 /// post-increment LD1R. 19092 static SDValue performPostLD1Combine(SDNode *N, 19093 TargetLowering::DAGCombinerInfo &DCI, 19094 bool IsLaneOp) { 19095 if (DCI.isBeforeLegalizeOps()) 19096 return SDValue(); 19097 19098 SelectionDAG &DAG = DCI.DAG; 19099 EVT VT = N->getValueType(0); 19100 19101 if (!VT.is128BitVector() && !VT.is64BitVector()) 19102 return SDValue(); 19103 19104 unsigned LoadIdx = IsLaneOp ? 1 : 0; 19105 SDNode *LD = N->getOperand(LoadIdx).getNode(); 19106 // If it is not LOAD, can not do such combine. 19107 if (LD->getOpcode() != ISD::LOAD) 19108 return SDValue(); 19109 19110 // The vector lane must be a constant in the LD1LANE opcode. 19111 SDValue Lane; 19112 if (IsLaneOp) { 19113 Lane = N->getOperand(2); 19114 auto *LaneC = dyn_cast<ConstantSDNode>(Lane); 19115 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) 19116 return SDValue(); 19117 } 19118 19119 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 19120 EVT MemVT = LoadSDN->getMemoryVT(); 19121 // Check if memory operand is the same type as the vector element. 19122 if (MemVT != VT.getVectorElementType()) 19123 return SDValue(); 19124 19125 // Check if there are other uses. If so, do not combine as it will introduce 19126 // an extra load. 19127 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 19128 ++UI) { 19129 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 19130 continue; 19131 if (*UI != N) 19132 return SDValue(); 19133 } 19134 19135 SDValue Addr = LD->getOperand(1); 19136 SDValue Vector = N->getOperand(0); 19137 // Search for a use of the address operand that is an increment. 
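  // For example (illustrative type), for a v4i32 result a constant increment
  // only matches if it equals the 4-byte element size; a non-constant
  // increment is used directly as the write-back register. See the checks
  // inside the loop below.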
19138 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 19139 Addr.getNode()->use_end(); UI != UE; ++UI) { 19140 SDNode *User = *UI; 19141 if (User->getOpcode() != ISD::ADD 19142 || UI.getUse().getResNo() != Addr.getResNo()) 19143 continue; 19144 19145 // If the increment is a constant, it must match the memory ref size. 19146 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 19147 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 19148 uint32_t IncVal = CInc->getZExtValue(); 19149 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 19150 if (IncVal != NumBytes) 19151 continue; 19152 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 19153 } 19154 19155 // To avoid cycle construction make sure that neither the load nor the add 19156 // are predecessors to each other or the Vector. 19157 SmallPtrSet<const SDNode *, 32> Visited; 19158 SmallVector<const SDNode *, 16> Worklist; 19159 Visited.insert(Addr.getNode()); 19160 Worklist.push_back(User); 19161 Worklist.push_back(LD); 19162 Worklist.push_back(Vector.getNode()); 19163 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || 19164 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 19165 continue; 19166 19167 SmallVector<SDValue, 8> Ops; 19168 Ops.push_back(LD->getOperand(0)); // Chain 19169 if (IsLaneOp) { 19170 Ops.push_back(Vector); // The vector to be inserted 19171 Ops.push_back(Lane); // The lane to be inserted in the vector 19172 } 19173 Ops.push_back(Addr); 19174 Ops.push_back(Inc); 19175 19176 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 19177 SDVTList SDTys = DAG.getVTList(Tys); 19178 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 19179 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 19180 MemVT, 19181 LoadSDN->getMemOperand()); 19182 19183 // Update the uses. 19184 SDValue NewResults[] = { 19185 SDValue(LD, 0), // The result of load 19186 SDValue(UpdN.getNode(), 2) // Chain 19187 }; 19188 DCI.CombineTo(LD, NewResults); 19189 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 19190 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 19191 19192 break; 19193 } 19194 return SDValue(); 19195 } 19196 19197 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during 19198 /// address translation. 
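/// For example, with TBI an explicit mask that only clears bits [63:56], such
/// as (and %addr, 0x00ffffffffffffff), is redundant: only bits [55:0] are
/// demanded here, so SimplifyDemandedBits can look through it. (The constant
/// shown is illustrative.)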
static bool performTBISimplification(SDValue Addr,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
    return true;
  }
  return false;
}

static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
  assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
         "Expected STORE dag node in input!");

  if (auto Store = dyn_cast<StoreSDNode>(N)) {
    if (!Store->isTruncatingStore() || Store->isIndexed())
      return SDValue();
    SDValue Ext = Store->getValue();
    auto ExtOpCode = Ext.getOpcode();
    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
        ExtOpCode != ISD::ANY_EXTEND)
      return SDValue();
    SDValue Orig = Ext->getOperand(0);
    if (Store->getMemoryVT() != Orig.getValueType())
      return SDValue();
    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
                        Store->getBasePtr(), Store->getMemOperand());
  }

  return SDValue();
}

// Perform TBI simplification if supported by the target, and try to break up
// non-temporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
// load instructions can be selected.
static SDValue performLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  if (Subtarget->supportsAddressTopByteIgnored())
    performTBISimplification(N->getOperand(1), DCI, DAG);

  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
    return SDValue(N, 0);

  if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
      MemVT.getSizeInBits() % 256 == 0 ||
      256 % MemVT.getScalarSizeInBits() != 0)
    return SDValue(N, 0);

  SDLoc DL(LD);
  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  SDNodeFlags Flags = LD->getFlags();
  SmallVector<SDValue, 4> LoadOps;
  SmallVector<SDValue, 4> LoadOpsChain;
  // Replace any non-temporal load over 256 bits with a series of 256-bit loads
  // plus a final scalar/vector load of fewer than 256 bits. This way we can
  // utilize 256-bit loads and reduce the number of load instructions generated.
  MVT NewVT =
      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
                       256 / MemVT.getVectorElementType().getSizeInBits());
  unsigned Num256Loads = MemVT.getSizeInBits() / 256;
  // Create all the 256-bit loads, starting at byte offset 0 and going up to
  // byte offset (Num256Loads - 1) * 32.
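  // For example (sizes are illustrative): a non-temporal v20i32 load (640
  // bits) becomes two v8i32 loads at byte offsets 0 and 32, plus a v4i32 load
  // at offset 64 that is widened to v8i32; the pieces are concatenated and the
  // original v20i32 value is extracted from the result.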
19270 for (unsigned I = 0; I < Num256Loads; I++) { 19271 unsigned PtrOffset = I * 32; 19272 SDValue NewPtr = DAG.getMemBasePlusOffset( 19273 BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags); 19274 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset); 19275 SDValue NewLoad = DAG.getLoad( 19276 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset), 19277 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo()); 19278 LoadOps.push_back(NewLoad); 19279 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1)); 19280 } 19281 19282 // Process remaining bits of the load operation. 19283 // This is done by creating an UNDEF vector to match the size of the 19284 // 256-bit loads and inserting the remaining load to it. We extract the 19285 // original load type at the end using EXTRACT_SUBVECTOR instruction. 19286 unsigned BitsRemaining = MemVT.getSizeInBits() % 256; 19287 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8; 19288 MVT RemainingVT = MVT::getVectorVT( 19289 MemVT.getVectorElementType().getSimpleVT(), 19290 BitsRemaining / MemVT.getVectorElementType().getSizeInBits()); 19291 SDValue NewPtr = 19292 DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags); 19293 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset); 19294 SDValue RemainingLoad = 19295 DAG.getLoad(RemainingVT, DL, Chain, NewPtr, 19296 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign, 19297 LD->getMemOperand()->getFlags(), LD->getAAInfo()); 19298 SDValue UndefVector = DAG.getUNDEF(NewVT); 19299 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL); 19300 SDValue ExtendedReminingLoad = 19301 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, 19302 {UndefVector, RemainingLoad, InsertIdx}); 19303 LoadOps.push_back(ExtendedReminingLoad); 19304 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1)); 19305 EVT ConcatVT = 19306 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 19307 LoadOps.size() * NewVT.getVectorNumElements()); 19308 SDValue ConcatVectors = 19309 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps); 19310 // Extract the original vector type size. 19311 SDValue ExtractSubVector = 19312 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, 19313 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)}); 19314 SDValue TokenFactor = 19315 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain); 19316 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL); 19317 } 19318 19319 static SDValue performSTORECombine(SDNode *N, 19320 TargetLowering::DAGCombinerInfo &DCI, 19321 SelectionDAG &DAG, 19322 const AArch64Subtarget *Subtarget) { 19323 StoreSDNode *ST = cast<StoreSDNode>(N); 19324 SDValue Chain = ST->getChain(); 19325 SDValue Value = ST->getValue(); 19326 SDValue Ptr = ST->getBasePtr(); 19327 EVT ValueVT = Value.getValueType(); 19328 19329 auto hasValidElementTypeForFPTruncStore = [](EVT VT) { 19330 EVT EltVT = VT.getVectorElementType(); 19331 return EltVT == MVT::f32 || EltVT == MVT::f64; 19332 }; 19333 19334 // If this is an FP_ROUND followed by a store, fold this into a truncating 19335 // store. We can do this even if this is already a truncstore. 19336 // We purposefully don't care about legality of the nodes here as we know 19337 // they can be split down into something legal. 
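  // For example (types are illustrative, given a large enough SVE vector), a
  // store of (fp_round v16f32 %x to v16f16) becomes a truncating store of %x
  // with a v16f16 memory type.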
19338 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && 19339 Value.getNode()->hasOneUse() && ST->isUnindexed() && 19340 Subtarget->useSVEForFixedLengthVectors() && 19341 ValueVT.isFixedLengthVector() && 19342 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() && 19343 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType())) 19344 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, 19345 ST->getMemoryVT(), ST->getMemOperand()); 19346 19347 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) 19348 return Split; 19349 19350 if (Subtarget->supportsAddressTopByteIgnored() && 19351 performTBISimplification(N->getOperand(2), DCI, DAG)) 19352 return SDValue(N, 0); 19353 19354 if (SDValue Store = foldTruncStoreOfExt(DAG, N)) 19355 return Store; 19356 19357 return SDValue(); 19358 } 19359 19360 static SDValue performMSTORECombine(SDNode *N, 19361 TargetLowering::DAGCombinerInfo &DCI, 19362 SelectionDAG &DAG, 19363 const AArch64Subtarget *Subtarget) { 19364 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); 19365 SDValue Value = MST->getValue(); 19366 SDValue Mask = MST->getMask(); 19367 SDLoc DL(N); 19368 19369 // If this is a UZP1 followed by a masked store, fold this into a masked 19370 // truncating store. We can do this even if this is already a masked 19371 // truncstore. 19372 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() && 19373 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE && 19374 Value.getValueType().isInteger()) { 19375 Value = Value.getOperand(0); 19376 if (Value.getOpcode() == ISD::BITCAST) { 19377 EVT HalfVT = 19378 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); 19379 EVT InVT = Value.getOperand(0).getValueType(); 19380 19381 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) { 19382 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); 19383 unsigned PgPattern = Mask->getConstantOperandVal(0); 19384 19385 // Ensure we can double the size of the predicate pattern 19386 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); 19387 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <= 19388 MinSVESize) { 19389 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1), 19390 PgPattern); 19391 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0), 19392 MST->getBasePtr(), MST->getOffset(), Mask, 19393 MST->getMemoryVT(), MST->getMemOperand(), 19394 MST->getAddressingMode(), 19395 /*IsTruncating=*/true); 19396 } 19397 } 19398 } 19399 } 19400 19401 return SDValue(); 19402 } 19403 19404 /// \return true if part of the index was folded into the Base. 19405 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, 19406 SDLoc DL, SelectionDAG &DAG) { 19407 // This function assumes a vector of i64 indices. 19408 EVT IndexVT = Index.getValueType(); 19409 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) 19410 return false; 19411 19412 // Simplify: 19413 // BasePtr = Ptr 19414 // Index = X + splat(Offset) 19415 // -> 19416 // BasePtr = Ptr + Offset * scale. 
19417 // Index = X 19418 if (Index.getOpcode() == ISD::ADD) { 19419 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) { 19420 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); 19421 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); 19422 Index = Index.getOperand(0); 19423 return true; 19424 } 19425 } 19426 19427 // Simplify: 19428 // BasePtr = Ptr 19429 // Index = (X + splat(Offset)) << splat(Shift) 19430 // -> 19431 // BasePtr = Ptr + (Offset << Shift) * scale) 19432 // Index = X << splat(shift) 19433 if (Index.getOpcode() == ISD::SHL && 19434 Index.getOperand(0).getOpcode() == ISD::ADD) { 19435 SDValue Add = Index.getOperand(0); 19436 SDValue ShiftOp = Index.getOperand(1); 19437 SDValue OffsetOp = Add.getOperand(1); 19438 if (auto Shift = DAG.getSplatValue(ShiftOp)) 19439 if (auto Offset = DAG.getSplatValue(OffsetOp)) { 19440 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift); 19441 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); 19442 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); 19443 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(), 19444 Add.getOperand(0), ShiftOp); 19445 return true; 19446 } 19447 } 19448 19449 return false; 19450 } 19451 19452 // Analyse the specified address returning true if a more optimal addressing 19453 // mode is available. When returning true all parameters are updated to reflect 19454 // their recommended values. 19455 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, 19456 SDValue &BasePtr, SDValue &Index, 19457 SelectionDAG &DAG) { 19458 // Try to iteratively fold parts of the index into the base pointer to 19459 // simplify the index as much as possible. 19460 bool Changed = false; 19461 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG)) 19462 Changed = true; 19463 19464 // Only consider element types that are pointer sized as smaller types can 19465 // be easily promoted. 19466 EVT IndexVT = Index.getValueType(); 19467 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64) 19468 return Changed; 19469 19470 // Can indices be trivially shrunk? 19471 EVT DataVT = N->getOperand(1).getValueType(); 19472 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it 19473 // will later be re-extended to 64 bits in legalization 19474 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64) 19475 return Changed; 19476 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) { 19477 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); 19478 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index); 19479 return true; 19480 } 19481 19482 // Match: 19483 // Index = step(const) 19484 int64_t Stride = 0; 19485 if (Index.getOpcode() == ISD::STEP_VECTOR) { 19486 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue(); 19487 } 19488 // Match: 19489 // Index = step(const) << shift(const) 19490 else if (Index.getOpcode() == ISD::SHL && 19491 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) { 19492 SDValue RHS = Index.getOperand(1); 19493 if (auto *Shift = 19494 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) { 19495 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1); 19496 Stride = Step << Shift->getZExtValue(); 19497 } 19498 } 19499 19500 // Return early because no supported pattern is found. 
19501 if (Stride == 0) 19502 return Changed; 19503 19504 if (Stride < std::numeric_limits<int32_t>::min() || 19505 Stride > std::numeric_limits<int32_t>::max()) 19506 return Changed; 19507 19508 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 19509 unsigned MaxVScale = 19510 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock; 19511 int64_t LastElementOffset = 19512 IndexVT.getVectorMinNumElements() * Stride * MaxVScale; 19513 19514 if (LastElementOffset < std::numeric_limits<int32_t>::min() || 19515 LastElementOffset > std::numeric_limits<int32_t>::max()) 19516 return Changed; 19517 19518 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); 19519 // Stride does not scale explicitly by 'Scale', because it happens in 19520 // the gather/scatter addressing mode. 19521 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride)); 19522 return true; 19523 } 19524 19525 static SDValue performMaskedGatherScatterCombine( 19526 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { 19527 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N); 19528 assert(MGS && "Can only combine gather load or scatter store nodes"); 19529 19530 if (!DCI.isBeforeLegalize()) 19531 return SDValue(); 19532 19533 SDLoc DL(MGS); 19534 SDValue Chain = MGS->getChain(); 19535 SDValue Scale = MGS->getScale(); 19536 SDValue Index = MGS->getIndex(); 19537 SDValue Mask = MGS->getMask(); 19538 SDValue BasePtr = MGS->getBasePtr(); 19539 ISD::MemIndexType IndexType = MGS->getIndexType(); 19540 19541 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG)) 19542 return SDValue(); 19543 19544 // Here we catch such cases early and change MGATHER's IndexType to allow 19545 // the use of an Index that's more legalisation friendly. 19546 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) { 19547 SDValue PassThru = MGT->getPassThru(); 19548 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 19549 return DAG.getMaskedGather( 19550 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, 19551 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); 19552 } 19553 auto *MSC = cast<MaskedScatterSDNode>(MGS); 19554 SDValue Data = MSC->getValue(); 19555 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale}; 19556 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, 19557 Ops, MSC->getMemOperand(), IndexType, 19558 MSC->isTruncatingStore()); 19559 } 19560 19561 /// Target-specific DAG combine function for NEON load/store intrinsics 19562 /// to merge base address updates. 19563 static SDValue performNEONPostLDSTCombine(SDNode *N, 19564 TargetLowering::DAGCombinerInfo &DCI, 19565 SelectionDAG &DAG) { 19566 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 19567 return SDValue(); 19568 19569 unsigned AddrOpIdx = N->getNumOperands() - 1; 19570 SDValue Addr = N->getOperand(AddrOpIdx); 19571 19572 // Search for a use of the address operand that is an increment. 19573 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 19574 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 19575 SDNode *User = *UI; 19576 if (User->getOpcode() != ISD::ADD || 19577 UI.getUse().getResNo() != Addr.getResNo()) 19578 continue; 19579 19580 // Check that the add is independent of the load/store. Otherwise, folding 19581 // it would create a cycle. 
19582 SmallPtrSet<const SDNode *, 32> Visited; 19583 SmallVector<const SDNode *, 16> Worklist; 19584 Visited.insert(Addr.getNode()); 19585 Worklist.push_back(N); 19586 Worklist.push_back(User); 19587 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 19588 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 19589 continue; 19590 19591 // Find the new opcode for the updating load/store. 19592 bool IsStore = false; 19593 bool IsLaneOp = false; 19594 bool IsDupOp = false; 19595 unsigned NewOpc = 0; 19596 unsigned NumVecs = 0; 19597 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 19598 switch (IntNo) { 19599 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 19600 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 19601 NumVecs = 2; break; 19602 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 19603 NumVecs = 3; break; 19604 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 19605 NumVecs = 4; break; 19606 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 19607 NumVecs = 2; IsStore = true; break; 19608 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 19609 NumVecs = 3; IsStore = true; break; 19610 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 19611 NumVecs = 4; IsStore = true; break; 19612 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 19613 NumVecs = 2; break; 19614 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 19615 NumVecs = 3; break; 19616 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 19617 NumVecs = 4; break; 19618 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 19619 NumVecs = 2; IsStore = true; break; 19620 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 19621 NumVecs = 3; IsStore = true; break; 19622 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 19623 NumVecs = 4; IsStore = true; break; 19624 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 19625 NumVecs = 2; IsDupOp = true; break; 19626 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 19627 NumVecs = 3; IsDupOp = true; break; 19628 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 19629 NumVecs = 4; IsDupOp = true; break; 19630 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 19631 NumVecs = 2; IsLaneOp = true; break; 19632 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 19633 NumVecs = 3; IsLaneOp = true; break; 19634 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 19635 NumVecs = 4; IsLaneOp = true; break; 19636 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 19637 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 19638 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 19639 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 19640 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 19641 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 19642 } 19643 19644 EVT VecTy; 19645 if (IsStore) 19646 VecTy = N->getOperand(2).getValueType(); 19647 else 19648 VecTy = N->getValueType(0); 19649 19650 // If the increment is a constant, it must match the memory ref size. 19651 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 19652 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 19653 uint32_t IncVal = CInc->getZExtValue(); 19654 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 19655 if (IsLaneOp || IsDupOp) 19656 NumBytes /= VecTy.getVectorNumElements(); 19657 if (IncVal != NumBytes) 19658 continue; 19659 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 19660 } 19661 SmallVector<SDValue, 8> Ops; 19662 Ops.push_back(N->getOperand(0)); // Incoming chain 19663 // Load lane and store have vector list as input. 19664 if (IsLaneOp || IsStore) 19665 for (unsigned i = 2; i < AddrOpIdx; ++i) 19666 Ops.push_back(N->getOperand(i)); 19667 Ops.push_back(Addr); // Base register 19668 Ops.push_back(Inc); 19669 19670 // Return Types. 19671 EVT Tys[6]; 19672 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 19673 unsigned n; 19674 for (n = 0; n < NumResultVecs; ++n) 19675 Tys[n] = VecTy; 19676 Tys[n++] = MVT::i64; // Type of write back register 19677 Tys[n] = MVT::Other; // Type of the chain 19678 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); 19679 19680 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 19681 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 19682 MemInt->getMemoryVT(), 19683 MemInt->getMemOperand()); 19684 19685 // Update the uses. 19686 std::vector<SDValue> NewResults; 19687 for (unsigned i = 0; i < NumResultVecs; ++i) { 19688 NewResults.push_back(SDValue(UpdN.getNode(), i)); 19689 } 19690 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 19691 DCI.CombineTo(N, NewResults); 19692 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 19693 19694 break; 19695 } 19696 return SDValue(); 19697 } 19698 19699 // Checks to see if the value is the prescribed width and returns information 19700 // about its extension mode. 19701 static 19702 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 19703 ExtType = ISD::NON_EXTLOAD; 19704 switch(V.getNode()->getOpcode()) { 19705 default: 19706 return false; 19707 case ISD::LOAD: { 19708 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 19709 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 19710 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 19711 ExtType = LoadNode->getExtensionType(); 19712 return true; 19713 } 19714 return false; 19715 } 19716 case ISD::AssertSext: { 19717 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 19718 if ((TypeNode->getVT() == MVT::i8 && width == 8) 19719 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 19720 ExtType = ISD::SEXTLOAD; 19721 return true; 19722 } 19723 return false; 19724 } 19725 case ISD::AssertZext: { 19726 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 19727 if ((TypeNode->getVT() == MVT::i8 && width == 8) 19728 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 19729 ExtType = ISD::ZEXTLOAD; 19730 return true; 19731 } 19732 return false; 19733 } 19734 case ISD::Constant: 19735 case ISD::TargetConstant: { 19736 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 19737 1LL << (width - 1); 19738 } 19739 } 19740 19741 return true; 19742 } 19743 19744 // This function does a whole lot of voodoo to determine if the tests are 19745 // equivalent without and with a mask. 
// Essentially what happens is that, given a DAG resembling:
//
//      +-------------+ +-------------+ +-------------+ +-------------+
//      |    Input    | | AddConstant | | CompConstant| |     CC      |
//      +-------------+ +-------------+ +-------------+ +-------------+
//             |               |               |               |
//             V               V               |    +----------+
//              +-------------+  +----+        |    |
//              |     ADD     |  |0xff|        |    |
//              +-------------+  +----+        |    |
//                     |           |           |    |
//                     V           V           |    |
//                   +-------------+           |    |
//                   |     AND     |           |    |
//                   +-------------+           |    |
//                          |                  |    |
//                          +-----+            |    |
//                                |            |    |
//                                V            V    V
//                                  +-------------+
//                                  |     CMP     |
//                                  +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the
// nominal width of the input (this works for any input width; the graph above
// is specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2)
// values. The problem was simplified by working with 4-bit inputs, which means
// we only needed to reason about 24 distinct bit patterns: 8 patterns unique
// to zero extension (8 to 15), 8 patterns unique to sign extension (-8 to -1),
// and 8 patterns present in both extensions (0 to 7). For every distinct pair
// of AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of the function below is
// true for all 16 distinct bit patterns of the current extension type of the
// Input (w0):
//
// sub w8, w0, w1
// and w10, w8, #0x0f
// cmp w8, w2
// cset w9, AArch64CC
// cmp w10, w2
// cset w11, AArch64CC
// cmp w9, w11
// cset w0, eq
// ret
//
// Since the above function shows when the outputs are equivalent, it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to evaluate at compile time. The equations below were
// instead checked in a test harness, which confirmed that they give the same
// result as the function above for all inputs, so they can be used to
// determine whether the removal is legal.
//
// isEquivalentMaskless() is the test for whether the AND can be removed,
// factored out of the DAG recognition because the DAG can take several forms.

static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can make
  // them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons, sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range, we can just adjust the input and avoid writing equations
  // for sign-extended inputs.
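  // For example, with width == 8 the adjustment below subtracts 1 << 7 (i.e.
  // 128, half the 8-bit range) from AddConstant, which corresponds to the
  // displacement described above.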
19818 if (ExtType == ISD::SEXTLOAD) 19819 AddConstant -= (1 << (width-1)); 19820 19821 switch(CC) { 19822 case AArch64CC::LE: 19823 case AArch64CC::GT: 19824 if ((AddConstant == 0) || 19825 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 19826 (AddConstant >= 0 && CompConstant < 0) || 19827 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 19828 return true; 19829 break; 19830 case AArch64CC::LT: 19831 case AArch64CC::GE: 19832 if ((AddConstant == 0) || 19833 (AddConstant >= 0 && CompConstant <= 0) || 19834 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 19835 return true; 19836 break; 19837 case AArch64CC::HI: 19838 case AArch64CC::LS: 19839 if ((AddConstant >= 0 && CompConstant < 0) || 19840 (AddConstant <= 0 && CompConstant >= -1 && 19841 CompConstant < AddConstant + MaxUInt)) 19842 return true; 19843 break; 19844 case AArch64CC::PL: 19845 case AArch64CC::MI: 19846 if ((AddConstant == 0) || 19847 (AddConstant > 0 && CompConstant <= 0) || 19848 (AddConstant < 0 && CompConstant <= AddConstant)) 19849 return true; 19850 break; 19851 case AArch64CC::LO: 19852 case AArch64CC::HS: 19853 if ((AddConstant >= 0 && CompConstant <= 0) || 19854 (AddConstant <= 0 && CompConstant >= 0 && 19855 CompConstant <= AddConstant + MaxUInt)) 19856 return true; 19857 break; 19858 case AArch64CC::EQ: 19859 case AArch64CC::NE: 19860 if ((AddConstant > 0 && CompConstant < 0) || 19861 (AddConstant < 0 && CompConstant >= 0 && 19862 CompConstant < AddConstant + MaxUInt) || 19863 (AddConstant >= 0 && CompConstant >= 0 && 19864 CompConstant >= AddConstant) || 19865 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 19866 return true; 19867 break; 19868 case AArch64CC::VS: 19869 case AArch64CC::VC: 19870 case AArch64CC::AL: 19871 case AArch64CC::NV: 19872 return true; 19873 case AArch64CC::Invalid: 19874 break; 19875 } 19876 19877 return false; 19878 } 19879 19880 // (X & C) >u Mask --> (X & (C & (~Mask)) != 0 19881 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1)) == 0 19882 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, 19883 SDNode *AndNode, SelectionDAG &DAG, 19884 unsigned CCIndex, unsigned CmpIndex, 19885 unsigned CC) { 19886 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1)); 19887 if (!SubsC) 19888 return SDValue(); 19889 19890 APInt SubsAP = SubsC->getAPIntValue(); 19891 if (CC == AArch64CC::HI) { 19892 if (!SubsAP.isMask()) 19893 return SDValue(); 19894 } else if (CC == AArch64CC::LO) { 19895 if (!SubsAP.isPowerOf2()) 19896 return SDValue(); 19897 } else 19898 return SDValue(); 19899 19900 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1)); 19901 if (!AndC) 19902 return SDValue(); 19903 19904 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1); 19905 19906 SDLoc DL(N); 19907 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue(); 19908 SDValue ANDS = DAG.getNode( 19909 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0), 19910 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0))); 19911 SDValue AArch64_CC = 19912 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL, 19913 N->getOperand(CCIndex)->getValueType(0)); 19914 19915 // For now, only performCSELCombine and performBRCONDCombine call this 19916 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4 19917 // operands. So just init the ops direct to simplify the code. 
If we have some 19918 // other case with different CCIndex, CmpIndex, we need to use for loop to 19919 // rewrite the code here. 19920 // TODO: Do we need to assert number of operand is 4 here? 19921 assert((CCIndex == 2 && CmpIndex == 3) && 19922 "Expected CCIndex to be 2 and CmpIndex to be 3."); 19923 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC, 19924 ANDS.getValue(1)}; 19925 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops); 19926 } 19927 19928 static 19929 SDValue performCONDCombine(SDNode *N, 19930 TargetLowering::DAGCombinerInfo &DCI, 19931 SelectionDAG &DAG, unsigned CCIndex, 19932 unsigned CmpIndex) { 19933 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 19934 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 19935 unsigned CondOpcode = SubsNode->getOpcode(); 19936 19937 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0)) 19938 return SDValue(); 19939 19940 // There is a SUBS feeding this condition. Is it fed by a mask we can 19941 // use? 19942 19943 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 19944 unsigned MaskBits = 0; 19945 19946 if (AndNode->getOpcode() != ISD::AND) 19947 return SDValue(); 19948 19949 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex, 19950 CmpIndex, CC)) 19951 return Val; 19952 19953 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 19954 uint32_t CNV = CN->getZExtValue(); 19955 if (CNV == 255) 19956 MaskBits = 8; 19957 else if (CNV == 65535) 19958 MaskBits = 16; 19959 } 19960 19961 if (!MaskBits) 19962 return SDValue(); 19963 19964 SDValue AddValue = AndNode->getOperand(0); 19965 19966 if (AddValue.getOpcode() != ISD::ADD) 19967 return SDValue(); 19968 19969 // The basic dag structure is correct, grab the inputs and validate them. 19970 19971 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 19972 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 19973 SDValue SubsInputValue = SubsNode->getOperand(1); 19974 19975 // The mask is present and the provenance of all the values is a smaller type, 19976 // lets see if the mask is superfluous. 19977 19978 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 19979 !isa<ConstantSDNode>(SubsInputValue.getNode())) 19980 return SDValue(); 19981 19982 ISD::LoadExtType ExtType; 19983 19984 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 19985 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 19986 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 19987 return SDValue(); 19988 19989 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 19990 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 19991 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 19992 return SDValue(); 19993 19994 // The AND is not necessary, remove it. 19995 19996 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 19997 SubsNode->getValueType(1)); 19998 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 19999 20000 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 20001 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 20002 20003 return SDValue(N, 0); 20004 } 20005 20006 // Optimize compare with zero and branch. 
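// For example, (brcond ne, (SUBS x, 0), dest) becomes (CBNZ x, dest) and the
// eq form becomes (CBZ x, dest), provided only the flag result of the compare
// is used.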
20007 static SDValue performBRCONDCombine(SDNode *N, 20008 TargetLowering::DAGCombinerInfo &DCI, 20009 SelectionDAG &DAG) { 20010 MachineFunction &MF = DAG.getMachineFunction(); 20011 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 20012 // will not be produced, as they are conditional branch instructions that do 20013 // not set flags. 20014 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) 20015 return SDValue(); 20016 20017 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) 20018 N = NV.getNode(); 20019 SDValue Chain = N->getOperand(0); 20020 SDValue Dest = N->getOperand(1); 20021 SDValue CCVal = N->getOperand(2); 20022 SDValue Cmp = N->getOperand(3); 20023 20024 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 20025 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 20026 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 20027 return SDValue(); 20028 20029 unsigned CmpOpc = Cmp.getOpcode(); 20030 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 20031 return SDValue(); 20032 20033 // Only attempt folding if there is only one use of the flag and no use of the 20034 // value. 20035 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 20036 return SDValue(); 20037 20038 SDValue LHS = Cmp.getOperand(0); 20039 SDValue RHS = Cmp.getOperand(1); 20040 20041 assert(LHS.getValueType() == RHS.getValueType() && 20042 "Expected the value type to be the same for both operands!"); 20043 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 20044 return SDValue(); 20045 20046 if (isNullConstant(LHS)) 20047 std::swap(LHS, RHS); 20048 20049 if (!isNullConstant(RHS)) 20050 return SDValue(); 20051 20052 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 20053 LHS.getOpcode() == ISD::SRL) 20054 return SDValue(); 20055 20056 // Fold the compare into the branch instruction. 20057 SDValue BR; 20058 if (CC == AArch64CC::EQ) 20059 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 20060 else 20061 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 20062 20063 // Do not add new nodes to DAG combiner worklist. 20064 DCI.CombineTo(N, BR, false); 20065 20066 return SDValue(); 20067 } 20068 20069 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) { 20070 unsigned CC = N->getConstantOperandVal(2); 20071 SDValue SUBS = N->getOperand(3); 20072 SDValue Zero, CTTZ; 20073 20074 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) { 20075 Zero = N->getOperand(0); 20076 CTTZ = N->getOperand(1); 20077 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) { 20078 Zero = N->getOperand(1); 20079 CTTZ = N->getOperand(0); 20080 } else 20081 return SDValue(); 20082 20083 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) || 20084 (CTTZ.getOpcode() == ISD::TRUNCATE && 20085 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ)) 20086 return SDValue(); 20087 20088 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) && 20089 "Illegal type in CTTZ folding"); 20090 20091 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1))) 20092 return SDValue(); 20093 20094 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE 20095 ? CTTZ.getOperand(0).getOperand(0) 20096 : CTTZ.getOperand(0); 20097 20098 if (X != SUBS.getOperand(0)) 20099 return SDValue(); 20100 20101 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE 20102 ? 
CTTZ.getOperand(0).getValueSizeInBits() 20103 : CTTZ.getValueSizeInBits(); 20104 SDValue BitWidthMinusOne = 20105 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType()); 20106 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ, 20107 BitWidthMinusOne); 20108 } 20109 20110 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond) 20111 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond) 20112 // Where x and y are constants and x != y 20113 20114 // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond) 20115 // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond) 20116 // Where x and y are constants and x != y 20117 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) { 20118 SDValue L = Op->getOperand(0); 20119 SDValue R = Op->getOperand(1); 20120 AArch64CC::CondCode OpCC = 20121 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2)); 20122 20123 SDValue OpCmp = Op->getOperand(3); 20124 if (!isCMP(OpCmp)) 20125 return SDValue(); 20126 20127 SDValue CmpLHS = OpCmp.getOperand(0); 20128 SDValue CmpRHS = OpCmp.getOperand(1); 20129 20130 if (CmpRHS.getOpcode() == AArch64ISD::CSEL) 20131 std::swap(CmpLHS, CmpRHS); 20132 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL) 20133 return SDValue(); 20134 20135 SDValue X = CmpLHS->getOperand(0); 20136 SDValue Y = CmpLHS->getOperand(1); 20137 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) { 20138 return SDValue(); 20139 } 20140 20141 // If one of the constant is opaque constant, x,y sdnode is still different 20142 // but the real value maybe the same. So check APInt here to make sure the 20143 // code is correct. 20144 ConstantSDNode *CX = cast<ConstantSDNode>(X); 20145 ConstantSDNode *CY = cast<ConstantSDNode>(Y); 20146 if (CX->getAPIntValue() == CY->getAPIntValue()) 20147 return SDValue(); 20148 20149 AArch64CC::CondCode CC = 20150 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2)); 20151 SDValue Cond = CmpLHS->getOperand(3); 20152 20153 if (CmpRHS == Y) 20154 CC = AArch64CC::getInvertedCondCode(CC); 20155 else if (CmpRHS != X) 20156 return SDValue(); 20157 20158 if (OpCC == AArch64CC::NE) 20159 CC = AArch64CC::getInvertedCondCode(CC); 20160 else if (OpCC != AArch64CC::EQ) 20161 return SDValue(); 20162 20163 SDLoc DL(Op); 20164 EVT VT = Op->getValueType(0); 20165 20166 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32); 20167 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond); 20168 } 20169 20170 // Optimize CSEL instructions 20171 static SDValue performCSELCombine(SDNode *N, 20172 TargetLowering::DAGCombinerInfo &DCI, 20173 SelectionDAG &DAG) { 20174 // CSEL x, x, cc -> x 20175 if (N->getOperand(0) == N->getOperand(1)) 20176 return N->getOperand(0); 20177 20178 if (SDValue R = foldCSELOfCSEL(N, DAG)) 20179 return R; 20180 20181 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 20182 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 20183 if (SDValue Folded = foldCSELofCTTZ(N, DAG)) 20184 return Folded; 20185 20186 return performCONDCombine(N, DCI, DAG, 2, 3); 20187 } 20188 20189 // Try to re-use an already extended operand of a vector SetCC feeding a 20190 // extended select. Doing so avoids requiring another full extension of the 20191 // SET_CC result when lowering the select. 
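// For example (types are illustrative), given (setcc v8i8 %a, splat(C), setlt)
// whose only uses are vselects producing v8i16, an existing
// (sign_extend %a to v8i16) lets the compare be rewritten on the widened
// operands, so the setcc result no longer needs to be extended.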
20192 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { 20193 EVT Op0MVT = Op->getOperand(0).getValueType(); 20194 if (!Op0MVT.isVector() || Op->use_empty()) 20195 return SDValue(); 20196 20197 // Make sure that all uses of Op are VSELECTs with result matching types where 20198 // the result type has a larger element type than the SetCC operand. 20199 SDNode *FirstUse = *Op->use_begin(); 20200 if (FirstUse->getOpcode() != ISD::VSELECT) 20201 return SDValue(); 20202 EVT UseMVT = FirstUse->getValueType(0); 20203 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) 20204 return SDValue(); 20205 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) { 20206 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT; 20207 })) 20208 return SDValue(); 20209 20210 APInt V; 20211 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V)) 20212 return SDValue(); 20213 20214 SDLoc DL(Op); 20215 SDValue Op0ExtV; 20216 SDValue Op1ExtV; 20217 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get(); 20218 // Check if the first operand of the SET_CC is already extended. If it is, 20219 // split the SET_CC and re-use the extended version of the operand. 20220 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT), 20221 Op->getOperand(0)); 20222 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT), 20223 Op->getOperand(0)); 20224 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { 20225 Op0ExtV = SDValue(Op0SExt, 0); 20226 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1)); 20227 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { 20228 Op0ExtV = SDValue(Op0ZExt, 0); 20229 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1)); 20230 } else 20231 return SDValue(); 20232 20233 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1), 20234 Op0ExtV, Op1ExtV, Op->getOperand(2)); 20235 } 20236 20237 static SDValue performSETCCCombine(SDNode *N, 20238 TargetLowering::DAGCombinerInfo &DCI, 20239 SelectionDAG &DAG) { 20240 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); 20241 SDValue LHS = N->getOperand(0); 20242 SDValue RHS = N->getOperand(1); 20243 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); 20244 SDLoc DL(N); 20245 EVT VT = N->getValueType(0); 20246 20247 if (SDValue V = tryToWidenSetCCOperands(N, DAG)) 20248 return V; 20249 20250 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X 20251 if (Cond == ISD::SETNE && isOneConstant(RHS) && 20252 LHS->getOpcode() == AArch64ISD::CSEL && 20253 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) && 20254 LHS->hasOneUse()) { 20255 // Invert CSEL's condition. 
20256 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2)); 20257 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue()); 20258 auto NewCond = getInvertedCondCode(OldCond); 20259 20260 // csel 0, 1, !cond, X 20261 SDValue CSEL = 20262 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), 20263 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), 20264 LHS.getOperand(3)); 20265 return DAG.getZExtOrTrunc(CSEL, DL, VT); 20266 } 20267 20268 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne 20269 if (Cond == ISD::SETNE && isNullConstant(RHS) && 20270 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) && 20271 LHS->hasOneUse()) { 20272 EVT TstVT = LHS->getValueType(0); 20273 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { 20274 // this pattern will get better opt in emitComparison 20275 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1); 20276 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0), 20277 DAG.getConstant(TstImm, DL, TstVT)); 20278 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2)); 20279 } 20280 } 20281 20282 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne) 20283 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne) 20284 if (DCI.isBeforeLegalize() && VT.isScalarInteger() && 20285 (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && 20286 LHS->getOpcode() == ISD::BITCAST) { 20287 EVT ToVT = LHS->getValueType(0); 20288 EVT FromVT = LHS->getOperand(0).getValueType(); 20289 if (FromVT.isFixedLengthVector() && 20290 FromVT.getVectorElementType() == MVT::i1) { 20291 LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0)); 20292 LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS); 20293 return DAG.getSetCC(DL, VT, LHS, RHS, Cond); 20294 } 20295 } 20296 20297 // Try to perform the memcmp when the result is tested for [in]equality with 0 20298 if (SDValue V = performOrXorChainCombine(N, DAG)) 20299 return V; 20300 20301 return SDValue(); 20302 } 20303 20304 // Replace a flag-setting operator (eg ANDS) with the generic version 20305 // (eg AND) if the flag is unused. 20306 static SDValue performFlagSettingCombine(SDNode *N, 20307 TargetLowering::DAGCombinerInfo &DCI, 20308 unsigned GenericOpcode) { 20309 SDLoc DL(N); 20310 SDValue LHS = N->getOperand(0); 20311 SDValue RHS = N->getOperand(1); 20312 EVT VT = N->getValueType(0); 20313 20314 // If the flag result isn't used, convert back to a generic opcode. 20315 if (!N->hasAnyUseOfValue(1)) { 20316 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); 20317 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, 20318 DL); 20319 } 20320 20321 // Combine identical generic nodes into this node, re-using the result. 20322 if (SDNode *Generic = DCI.DAG.getNodeIfExists( 20323 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) 20324 DCI.CombineTo(Generic, SDValue(N, 0)); 20325 20326 return SDValue(); 20327 } 20328 20329 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { 20330 // setcc_merge_zero pred 20331 // (sign_extend (extract_subvector (setcc_merge_zero ... 
pred ...))), 0, ne 20332 // => extract_subvector (inner setcc_merge_zero) 20333 SDValue Pred = N->getOperand(0); 20334 SDValue LHS = N->getOperand(1); 20335 SDValue RHS = N->getOperand(2); 20336 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); 20337 20338 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || 20339 LHS->getOpcode() != ISD::SIGN_EXTEND) 20340 return SDValue(); 20341 20342 SDValue Extract = LHS->getOperand(0); 20343 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || 20344 Extract->getValueType(0) != N->getValueType(0) || 20345 Extract->getConstantOperandVal(1) != 0) 20346 return SDValue(); 20347 20348 SDValue InnerSetCC = Extract->getOperand(0); 20349 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) 20350 return SDValue(); 20351 20352 // By this point we've effectively got 20353 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive 20354 // lanes are already zero then the trunc(sext()) sequence is redundant and we 20355 // can operate on A directly. 20356 SDValue InnerPred = InnerSetCC.getOperand(0); 20357 if (Pred.getOpcode() == AArch64ISD::PTRUE && 20358 InnerPred.getOpcode() == AArch64ISD::PTRUE && 20359 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && 20360 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && 20361 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) 20362 return Extract; 20363 20364 return SDValue(); 20365 } 20366 20367 static SDValue 20368 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 20369 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 20370 "Unexpected opcode!"); 20371 20372 SelectionDAG &DAG = DCI.DAG; 20373 SDValue Pred = N->getOperand(0); 20374 SDValue LHS = N->getOperand(1); 20375 SDValue RHS = N->getOperand(2); 20376 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); 20377 20378 if (SDValue V = performSetCCPunpkCombine(N, DAG)) 20379 return V; 20380 20381 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && 20382 LHS->getOpcode() == ISD::SIGN_EXTEND && 20383 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) { 20384 // setcc_merge_zero( 20385 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) 20386 // => setcc_merge_zero(pred, ...) 20387 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 20388 LHS->getOperand(0)->getOperand(0) == Pred) 20389 return LHS->getOperand(0); 20390 20391 // setcc_merge_zero( 20392 // all_active, extend(nxvNi1 ...), != splat(0)) 20393 // -> nxvNi1 ... 20394 if (isAllActivePredicate(DAG, Pred)) 20395 return LHS->getOperand(0); 20396 20397 // setcc_merge_zero( 20398 // pred, extend(nxvNi1 ...), != splat(0)) 20399 // -> nxvNi1 and(pred, ...) 20400 if (DCI.isAfterLegalizeDAG()) 20401 // Do this after legalization to allow more folds on setcc_merge_zero 20402 // to be recognized. 20403 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), 20404 LHS->getOperand(0), Pred); 20405 } 20406 20407 return SDValue(); 20408 } 20409 20410 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 20411 // as well as whether the test should be inverted. This code is required to 20412 // catch these cases (as opposed to standard dag combines) because 20413 // AArch64ISD::TBZ is matched during legalization. 
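// For example, (tbz (and x, 0x4), 2) can test bit 2 of x directly, and
// (tbz (xor x, -1), b) becomes (tbnz x, b); the cases below walk through the
// operand to find the bit that is actually tested.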
20414 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 20415 SelectionDAG &DAG) { 20416 20417 if (!Op->hasOneUse()) 20418 return Op; 20419 20420 // We don't handle undef/constant-fold cases below, as they should have 20421 // already been taken care of (e.g. and of 0, test of undefined shifted bits, 20422 // etc.) 20423 20424 // (tbz (trunc x), b) -> (tbz x, b) 20425 // This case is just here to enable more of the below cases to be caught. 20426 if (Op->getOpcode() == ISD::TRUNCATE && 20427 Bit < Op->getValueType(0).getSizeInBits()) { 20428 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20429 } 20430 20431 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 20432 if (Op->getOpcode() == ISD::ANY_EXTEND && 20433 Bit < Op->getOperand(0).getValueSizeInBits()) { 20434 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20435 } 20436 20437 if (Op->getNumOperands() != 2) 20438 return Op; 20439 20440 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 20441 if (!C) 20442 return Op; 20443 20444 switch (Op->getOpcode()) { 20445 default: 20446 return Op; 20447 20448 // (tbz (and x, m), b) -> (tbz x, b) 20449 case ISD::AND: 20450 if ((C->getZExtValue() >> Bit) & 1) 20451 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20452 return Op; 20453 20454 // (tbz (shl x, c), b) -> (tbz x, b-c) 20455 case ISD::SHL: 20456 if (C->getZExtValue() <= Bit && 20457 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 20458 Bit = Bit - C->getZExtValue(); 20459 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20460 } 20461 return Op; 20462 20463 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 20464 case ISD::SRA: 20465 Bit = Bit + C->getZExtValue(); 20466 if (Bit >= Op->getValueType(0).getSizeInBits()) 20467 Bit = Op->getValueType(0).getSizeInBits() - 1; 20468 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20469 20470 // (tbz (srl x, c), b) -> (tbz x, b+c) 20471 case ISD::SRL: 20472 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 20473 Bit = Bit + C->getZExtValue(); 20474 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20475 } 20476 return Op; 20477 20478 // (tbz (xor x, -1), b) -> (tbnz x, b) 20479 case ISD::XOR: 20480 if ((C->getZExtValue() >> Bit) & 1) 20481 Invert = !Invert; 20482 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 20483 } 20484 } 20485 20486 // Optimize test single bit zero/non-zero and branch. 20487 static SDValue performTBZCombine(SDNode *N, 20488 TargetLowering::DAGCombinerInfo &DCI, 20489 SelectionDAG &DAG) { 20490 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 20491 bool Invert = false; 20492 SDValue TestSrc = N->getOperand(1); 20493 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 20494 20495 if (TestSrc == NewTestSrc) 20496 return SDValue(); 20497 20498 unsigned NewOpc = N->getOpcode(); 20499 if (Invert) { 20500 if (NewOpc == AArch64ISD::TBZ) 20501 NewOpc = AArch64ISD::TBNZ; 20502 else { 20503 assert(NewOpc == AArch64ISD::TBNZ); 20504 NewOpc = AArch64ISD::TBZ; 20505 } 20506 } 20507 20508 SDLoc DL(N); 20509 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 20510 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 20511 } 20512 20513 // Swap vselect operands where it may allow a predicated operation to achieve 20514 // the `sel`. 
20515 //
20516 // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
20517 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
20518 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
20519 auto SelectA = N->getOperand(1);
20520 auto SelectB = N->getOperand(2);
20521 auto NTy = N->getValueType(0);
20522
20523 if (!NTy.isScalableVector())
20524 return SDValue();
20525 SDValue SetCC = N->getOperand(0);
20526 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
20527 return SDValue();
20528
20529 switch (SelectB.getOpcode()) {
20530 default:
20531 return SDValue();
20532 case ISD::FMUL:
20533 case ISD::FSUB:
20534 case ISD::FADD:
20535 break;
20536 }
20537 if (SelectA != SelectB.getOperand(0))
20538 return SDValue();
20539
20540 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
20541 ISD::CondCode InverseCC =
20542 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
20543 auto InverseSetCC =
20544 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
20545 SetCC.getOperand(1), InverseCC);
20546
20547 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
20548 {InverseSetCC, SelectB, SelectA});
20549 }
20550
20551 // vselect (v1i1 setcc) ->
20552 // vselect (v1iXX setcc) (XX is the size of the compared operand type)
20553 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
20554 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
20555 // such VSELECT.
20556 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
20557 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
20558 return SwapResult;
20559
20560 SDValue N0 = N->getOperand(0);
20561 EVT CCVT = N0.getValueType();
20562
20563 if (isAllActivePredicate(DAG, N0))
20564 return N->getOperand(1);
20565
20566 if (isAllInactivePredicate(N0))
20567 return N->getOperand(2);
20568
20569 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
20570 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
20571 // supported types.
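// The ASR by N-1 broadcasts the sign bit of each lane (0 for non-negative
// lanes, all ones for negative lanes), so OR-ing with 1 yields exactly the
// 1 / -1 result selected on the sign of lhs.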
20572 SDValue SetCC = N->getOperand(0); 20573 if (SetCC.getOpcode() == ISD::SETCC && 20574 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) { 20575 SDValue CmpLHS = SetCC.getOperand(0); 20576 EVT VT = CmpLHS.getValueType(); 20577 SDNode *CmpRHS = SetCC.getOperand(1).getNode(); 20578 SDNode *SplatLHS = N->getOperand(1).getNode(); 20579 SDNode *SplatRHS = N->getOperand(2).getNode(); 20580 APInt SplatLHSVal; 20581 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() && 20582 VT.isSimple() && 20583 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 20584 MVT::v2i32, MVT::v4i32, MVT::v2i64}), 20585 VT.getSimpleVT().SimpleTy) && 20586 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) && 20587 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) && 20588 ISD::isConstantSplatVectorAllOnes(SplatRHS)) { 20589 unsigned NumElts = VT.getVectorNumElements(); 20590 SmallVector<SDValue, 8> Ops( 20591 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N), 20592 VT.getScalarType())); 20593 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops); 20594 20595 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val); 20596 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1)); 20597 return Or; 20598 } 20599 } 20600 20601 if (N0.getOpcode() != ISD::SETCC || 20602 CCVT.getVectorElementCount() != ElementCount::getFixed(1) || 20603 CCVT.getVectorElementType() != MVT::i1) 20604 return SDValue(); 20605 20606 EVT ResVT = N->getValueType(0); 20607 EVT CmpVT = N0.getOperand(0).getValueType(); 20608 // Only combine when the result type is of the same size as the compared 20609 // operands. 20610 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 20611 return SDValue(); 20612 20613 SDValue IfTrue = N->getOperand(1); 20614 SDValue IfFalse = N->getOperand(2); 20615 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 20616 N0.getOperand(0), N0.getOperand(1), 20617 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 20618 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 20619 IfTrue, IfFalse); 20620 } 20621 20622 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 20623 /// the compare-mask instructions rather than going via NZCV, even if LHS and 20624 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 20625 /// with a vector one followed by a DUP shuffle on the result. 20626 static SDValue performSelectCombine(SDNode *N, 20627 TargetLowering::DAGCombinerInfo &DCI) { 20628 SelectionDAG &DAG = DCI.DAG; 20629 SDValue N0 = N->getOperand(0); 20630 EVT ResVT = N->getValueType(0); 20631 20632 if (N0.getOpcode() != ISD::SETCC) 20633 return SDValue(); 20634 20635 if (ResVT.isScalableVector()) 20636 return SDValue(); 20637 20638 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 20639 // scalar SetCCResultType. We also don't expect vectors, because we assume 20640 // that selects fed by vector SETCCs are canonicalized to VSELECT. 20641 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 20642 "Scalar-SETCC feeding SELECT has unexpected result type!"); 20643 20644 // If NumMaskElts == 0, the comparison is larger than select result. The 20645 // largest real NEON comparison is 64-bits per lane, which means the result is 20646 // at most 32-bits and an illegal vector. Just bail out for now. 20647 EVT SrcVT = N0.getOperand(0).getValueType(); 20648 20649 // Don't try to do this optimization when the setcc itself has i1 operands. 
20650 // There are no legal vectors of i1, so this would be pointless. 20651 if (SrcVT == MVT::i1) 20652 return SDValue(); 20653 20654 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 20655 if (!ResVT.isVector() || NumMaskElts == 0) 20656 return SDValue(); 20657 20658 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 20659 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 20660 20661 // Also bail out if the vector CCVT isn't the same size as ResVT. 20662 // This can happen if the SETCC operand size doesn't divide the ResVT size 20663 // (e.g., f64 vs v3f32). 20664 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 20665 return SDValue(); 20666 20667 // Make sure we didn't create illegal types, if we're not supposed to. 20668 assert(DCI.isBeforeLegalize() || 20669 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 20670 20671 // First perform a vector comparison, where lane 0 is the one we're interested 20672 // in. 20673 SDLoc DL(N0); 20674 SDValue LHS = 20675 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 20676 SDValue RHS = 20677 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 20678 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 20679 20680 // Now duplicate the comparison mask we want across all other lanes. 20681 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 20682 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 20683 Mask = DAG.getNode(ISD::BITCAST, DL, 20684 ResVT.changeVectorElementTypeToInteger(), Mask); 20685 20686 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 20687 } 20688 20689 static SDValue performDUPCombine(SDNode *N, 20690 TargetLowering::DAGCombinerInfo &DCI) { 20691 EVT VT = N->getValueType(0); 20692 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the 20693 // 128bit vector version. 20694 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { 20695 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); 20696 if (SDNode *LN = DCI.DAG.getNodeIfExists( 20697 N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) { 20698 SDLoc DL(N); 20699 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), 20700 DCI.DAG.getConstant(0, DL, MVT::i64)); 20701 } 20702 } 20703 20704 return performPostLD1Combine(N, DCI, false); 20705 } 20706 20707 /// Get rid of unnecessary NVCASTs (that don't change the type). 20708 static SDValue performNVCASTCombine(SDNode *N) { 20709 if (N->getValueType(0) == N->getOperand(0).getValueType()) 20710 return N->getOperand(0); 20711 20712 return SDValue(); 20713 } 20714 20715 // If all users of the globaladdr are of the form (globaladdr + constant), find 20716 // the smallest constant, fold it into the globaladdr's offset and rewrite the 20717 // globaladdr as (globaladdr + constant) - constant. 
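// For example, if the only uses are (globaladdr + 8) and (globaladdr + 24),
// MinOffset is 8 and the node is rewritten as ((globaladdr + 8) - 8), letting
// the constant additions in the users fold away against the -8.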
20718 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
20719 const AArch64Subtarget *Subtarget,
20720 const TargetMachine &TM) {
20721 auto *GN = cast<GlobalAddressSDNode>(N);
20722 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
20723 AArch64II::MO_NO_FLAG)
20724 return SDValue();
20725
20726 uint64_t MinOffset = -1ull;
20727 for (SDNode *N : GN->uses()) {
20728 if (N->getOpcode() != ISD::ADD)
20729 return SDValue();
20730 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
20731 if (!C)
20732 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
20733 if (!C)
20734 return SDValue();
20735 MinOffset = std::min(MinOffset, C->getZExtValue());
20736 }
20737 uint64_t Offset = MinOffset + GN->getOffset();
20738
20739 // Require that the new offset is larger than the existing one. Otherwise, we
20740 // can end up oscillating between two possible DAGs, for example,
20741 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
20742 if (Offset <= uint64_t(GN->getOffset()))
20743 return SDValue();
20744
20745 // Check whether folding this offset is legal. It must not go out of bounds of
20746 // the referenced object to avoid violating the code model, and must be
20747 // smaller than 2^20 because this is the largest offset expressible in all
20748 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
20749 // stores an immediate signed 21 bit offset.)
20750 //
20751 // This check also prevents us from folding negative offsets, which will end
20752 // up being treated in the same way as large positive ones. They could also
20753 // cause code model violations, and aren't really common enough to matter.
20754 if (Offset >= (1 << 20))
20755 return SDValue();
20756
20757 const GlobalValue *GV = GN->getGlobal();
20758 Type *T = GV->getValueType();
20759 if (!T->isSized() ||
20760 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
20761 return SDValue();
20762
20763 SDLoc DL(GN);
20764 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
20765 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
20766 DAG.getConstant(MinOffset, DL, MVT::i64));
20767 }
20768
20769 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
20770 const AArch64Subtarget *Subtarget) {
20771 SDValue BR = N->getOperand(0);
20772 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
20773 !BR.getValueType().isScalarInteger())
20774 return SDValue();
20775
20776 SDLoc DL(N);
20777 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
20778 }
20779
20780 // Turns the vector of indices into a vector of byte offsets by scaling Offset
20781 // by (BitWidth / 8).
20782 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
20783 SDLoc DL, unsigned BitWidth) {
20784 assert(Offset.getValueType().isScalableVector() &&
20785 "This method is only for scalable vectors of offsets");
20786
20787 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
20788 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
20789
20790 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
20791 }
20792
20793 /// Check if the value of \p OffsetInBytes can be used as an immediate for
20794 /// the gather load/prefetch and scatter store instructions with vector base and
20795 /// immediate offset addressing mode:
20796 ///
20797 /// [<Zn>.[S|D]{, #<imm>}]
20798 ///
20799 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
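/// For example, with 32-bit elements (sizeof(<T>) == 4) the valid byte
/// offsets are 0, 4, 8, ..., 124.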
20800 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
20801 unsigned ScalarSizeInBytes) {
20802 // The immediate is not a multiple of the scalar size.
20803 if (OffsetInBytes % ScalarSizeInBytes)
20804 return false;
20805
20806 // The immediate is out of range.
20807 if (OffsetInBytes / ScalarSizeInBytes > 31)
20808 return false;
20809
20810 return true;
20811 }
20812
20813 /// Check if the value of \p Offset represents a valid immediate for the SVE
20814 /// gather load/prefetch and scatter store instructions with vector base and
20815 /// immediate offset addressing mode:
20816 ///
20817 /// [<Zn>.[S|D]{, #<imm>}]
20818 ///
20819 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
20820 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
20821 unsigned ScalarSizeInBytes) {
20822 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
20823 return OffsetConst && isValidImmForSVEVecImmAddrMode(
20824 OffsetConst->getZExtValue(), ScalarSizeInBytes);
20825 }
20826
20827 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
20828 unsigned Opcode,
20829 bool OnlyPackedOffsets = true) {
20830 const SDValue Src = N->getOperand(2);
20831 const EVT SrcVT = Src->getValueType(0);
20832 assert(SrcVT.isScalableVector() &&
20833 "Scatter stores are only possible for SVE vectors");
20834
20835 SDLoc DL(N);
20836 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
20837
20838 // Make sure that source data will fit into an SVE register
20839 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20840 return SDValue();
20841
20842 // For FPs, ACLE only supports _packed_ single and double precision types.
20843 if (SrcElVT.isFloatingPoint())
20844 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
20845 return SDValue();
20846
20847 // Depending on the addressing mode, this is either a pointer or a vector of
20848 // pointers (that fits into one register)
20849 SDValue Base = N->getOperand(4);
20850 // Depending on the addressing mode, this is either a single offset or a
20851 // vector of offsets (that fits into one register)
20852 SDValue Offset = N->getOperand(5);
20853
20854 // For "scalar + vector of indices", just scale the indices. This only
20855 // applies to non-temporal scatters because there's no instruction that takes
20856 // indices.
20857 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
20858 Offset =
20859 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
20860 Opcode = AArch64ISD::SSTNT1_PRED;
20861 }
20862
20863 // In the case of non-temporal scatter stores there's only one SVE instruction
20864 // per data-size: "scalar + vector", i.e.
20865 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
20866 // Since we do have intrinsics that allow the arguments to be in a different
20867 // order, we may need to swap them to match the spec.
20868 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
20869 std::swap(Base, Offset);
20870
20871 // SST1_IMM requires that the offset is an immediate that is:
20872 // * a multiple of #SizeInBytes,
20873 // * in the range [0, 31 x #SizeInBytes],
20874 // where #SizeInBytes is the size in bytes of the stored items. For
20875 // immediates outside that range and non-immediate scalar offsets use SST1 or
20876 // SST1_UXTW instead.
20877 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
20878 if (!isValidImmForSVEVecImmAddrMode(Offset,
20879 SrcVT.getScalarSizeInBits() / 8)) {
20880 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20881 Opcode = AArch64ISD::SST1_UXTW_PRED;
20882 else
20883 Opcode = AArch64ISD::SST1_PRED;
20884
20885 std::swap(Base, Offset);
20886 }
20887 }
20888
20889 auto &TLI = DAG.getTargetLoweringInfo();
20890 if (!TLI.isTypeLegal(Base.getValueType()))
20891 return SDValue();
20892
20893 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
20894 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
20895 // nxv2i64. Legalize accordingly.
20896 if (!OnlyPackedOffsets &&
20897 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20898 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20899
20900 if (!TLI.isTypeLegal(Offset.getValueType()))
20901 return SDValue();
20902
20903 // Source value type that is representable in hardware
20904 EVT HwSrcVt = getSVEContainerType(SrcVT);
20905
20906 // Keep the original type of the input data to store - this is needed to be
20907 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
20908 // FP values we want the integer equivalent, so just use HwSrcVt.
20909 SDValue InputVT = DAG.getValueType(SrcVT);
20910 if (SrcVT.isFloatingPoint())
20911 InputVT = DAG.getValueType(HwSrcVt);
20912
20913 SDVTList VTs = DAG.getVTList(MVT::Other);
20914 SDValue SrcNew;
20915
20916 if (Src.getValueType().isFloatingPoint())
20917 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
20918 else
20919 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
20920
20921 SDValue Ops[] = {N->getOperand(0), // Chain
20922 SrcNew,
20923 N->getOperand(3), // Pg
20924 Base,
20925 Offset,
20926 InputVT};
20927
20928 return DAG.getNode(Opcode, DL, VTs, Ops);
20929 }
20930
20931 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
20932 unsigned Opcode,
20933 bool OnlyPackedOffsets = true) {
20934 const EVT RetVT = N->getValueType(0);
20935 assert(RetVT.isScalableVector() &&
20936 "Gather loads are only possible for SVE vectors");
20937
20938 SDLoc DL(N);
20939
20940 // Make sure that the loaded data will fit into an SVE register
20941 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20942 return SDValue();
20943
20944 // Depending on the addressing mode, this is either a pointer or a vector of
20945 // pointers (that fits into one register)
20946 SDValue Base = N->getOperand(3);
20947 // Depending on the addressing mode, this is either a single offset or a
20948 // vector of offsets (that fits into one register)
20949 SDValue Offset = N->getOperand(4);
20950
20951 // For "scalar + vector of indices", just scale the indices. This only
20952 // applies to non-temporal gathers because there's no instruction that takes
20953 // indices.
20954 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
20955 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
20956 RetVT.getScalarSizeInBits());
20957 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
20958 }
20959
20960 // In the case of non-temporal gather loads there's only one SVE instruction
20961 // per data-size: "scalar + vector", i.e.
20962 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20963 // Since we do have intrinsics that allow the arguments to be in a different
20964 // order, we may need to swap them to match the spec.
20965 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
20966 Offset.getValueType().isVector())
20967 std::swap(Base, Offset);
20968
20969 // GLD{FF}1_IMM requires that the offset is an immediate that is:
20970 // * a multiple of #SizeInBytes,
20971 // * in the range [0, 31 x #SizeInBytes],
20972 // where #SizeInBytes is the size in bytes of the loaded items. For
20973 // immediates outside that range and non-immediate scalar offsets use
20974 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
20975 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
20976 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
20977 if (!isValidImmForSVEVecImmAddrMode(Offset,
20978 RetVT.getScalarSizeInBits() / 8)) {
20979 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20980 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20981 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
20982 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
20983 else
20984 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20985 ? AArch64ISD::GLD1_MERGE_ZERO
20986 : AArch64ISD::GLDFF1_MERGE_ZERO;
20987
20988 std::swap(Base, Offset);
20989 }
20990 }
20991
20992 auto &TLI = DAG.getTargetLoweringInfo();
20993 if (!TLI.isTypeLegal(Base.getValueType()))
20994 return SDValue();
20995
20996 // Some gather load variants allow unpacked offsets, but only as nxv2i32
20997 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
20998 // nxv2i64. Legalize accordingly.
20999 if (!OnlyPackedOffsets &&
21000 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
21001 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
21002
21003 // Return value type that is representable in hardware
21004 EVT HwRetVt = getSVEContainerType(RetVT);
21005
21006 // Keep the original output value type around - this is needed to be able to
21007 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
21008 // values we want the integer equivalent, so just use HwRetVT.
21009 SDValue OutVT = DAG.getValueType(RetVT);
21010 if (RetVT.isFloatingPoint())
21011 OutVT = DAG.getValueType(HwRetVt);
21012
21013 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
21014 SDValue Ops[] = {N->getOperand(0), // Chain
21015 N->getOperand(2), // Pg
21016 Base, Offset, OutVT};
21017
21018 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
21019 SDValue LoadChain = SDValue(Load.getNode(), 1);
21020
21021 if (RetVT.isInteger() && (RetVT != HwRetVt))
21022 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
21023
21024 // If the original return value was FP, bitcast accordingly. Doing it here
21025 // means that we can avoid adding TableGen patterns for FPs.
21026 if (RetVT.isFloatingPoint())
21027 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
21028
21029 return DAG.getMergeValues({Load, LoadChain}, DL);
21030 }
21031
21032 static SDValue
21033 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21034 SelectionDAG &DAG) {
21035 SDLoc DL(N);
21036 SDValue Src = N->getOperand(0);
21037 unsigned Opc = Src->getOpcode();
21038
21039 // Sign extend of an unsigned unpack -> signed unpack
21040 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
21041
21042 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ?
AArch64ISD::SUNPKHI 21043 : AArch64ISD::SUNPKLO; 21044 21045 // Push the sign extend to the operand of the unpack 21046 // This is necessary where, for example, the operand of the unpack 21047 // is another unpack: 21048 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) 21049 // -> 21050 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) 21051 // -> 21052 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) 21053 SDValue ExtOp = Src->getOperand(0); 21054 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT(); 21055 EVT EltTy = VT.getVectorElementType(); 21056 (void)EltTy; 21057 21058 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && 21059 "Sign extending from an invalid type"); 21060 21061 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); 21062 21063 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), 21064 ExtOp, DAG.getValueType(ExtVT)); 21065 21066 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); 21067 } 21068 21069 if (DCI.isBeforeLegalizeOps()) 21070 return SDValue(); 21071 21072 if (!EnableCombineMGatherIntrinsics) 21073 return SDValue(); 21074 21075 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates 21076 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. 21077 unsigned NewOpc; 21078 unsigned MemVTOpNum = 4; 21079 switch (Opc) { 21080 case AArch64ISD::LD1_MERGE_ZERO: 21081 NewOpc = AArch64ISD::LD1S_MERGE_ZERO; 21082 MemVTOpNum = 3; 21083 break; 21084 case AArch64ISD::LDNF1_MERGE_ZERO: 21085 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; 21086 MemVTOpNum = 3; 21087 break; 21088 case AArch64ISD::LDFF1_MERGE_ZERO: 21089 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; 21090 MemVTOpNum = 3; 21091 break; 21092 case AArch64ISD::GLD1_MERGE_ZERO: 21093 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; 21094 break; 21095 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 21096 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 21097 break; 21098 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 21099 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 21100 break; 21101 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 21102 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 21103 break; 21104 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 21105 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 21106 break; 21107 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 21108 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 21109 break; 21110 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 21111 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; 21112 break; 21113 case AArch64ISD::GLDFF1_MERGE_ZERO: 21114 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; 21115 break; 21116 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 21117 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; 21118 break; 21119 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 21120 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; 21121 break; 21122 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 21123 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; 21124 break; 21125 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 21126 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; 21127 break; 21128 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 21129 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; 21130 break; 21131 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 21132 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; 21133 break; 21134 case AArch64ISD::GLDNT1_MERGE_ZERO: 21135 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; 21136 break; 21137 default: 21138 return SDValue(); 21139 } 21140 21141 EVT SignExtSrcVT = 
cast<VTSDNode>(N->getOperand(1))->getVT(); 21142 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); 21143 21144 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) 21145 return SDValue(); 21146 21147 EVT DstVT = N->getValueType(0); 21148 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); 21149 21150 SmallVector<SDValue, 5> Ops; 21151 for (unsigned I = 0; I < Src->getNumOperands(); ++I) 21152 Ops.push_back(Src->getOperand(I)); 21153 21154 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); 21155 DCI.CombineTo(N, ExtLoad); 21156 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1)); 21157 21158 // Return N so it doesn't get rechecked 21159 return SDValue(N, 0); 21160 } 21161 21162 /// Legalize the gather prefetch (scalar + vector addressing mode) when the 21163 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset 21164 /// != nxv2i32) do not need legalization. 21165 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { 21166 const unsigned OffsetPos = 4; 21167 SDValue Offset = N->getOperand(OffsetPos); 21168 21169 // Not an unpacked vector, bail out. 21170 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) 21171 return SDValue(); 21172 21173 // Extend the unpacked offset vector to 64-bit lanes. 21174 SDLoc DL(N); 21175 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); 21176 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 21177 // Replace the offset operand with the 64-bit one. 21178 Ops[OffsetPos] = Offset; 21179 21180 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 21181 } 21182 21183 /// Combines a node carrying the intrinsic 21184 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses 21185 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to 21186 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the 21187 /// sve gather prefetch instruction with vector plus immediate addressing mode. 21188 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, 21189 unsigned ScalarSizeInBytes) { 21190 const unsigned ImmPos = 4, OffsetPos = 3; 21191 // No need to combine the node if the immediate is valid... 21192 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) 21193 return SDValue(); 21194 21195 // ...otherwise swap the offset base with the offset... 21196 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 21197 std::swap(Ops[ImmPos], Ops[OffsetPos]); 21198 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to 21199 // `aarch64_sve_prfb_gather_uxtw_index`. 21200 SDLoc DL(N); 21201 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, 21202 MVT::i64); 21203 21204 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 21205 } 21206 21207 // Return true if the vector operation can guarantee only the first lane of its 21208 // result contains data, with all bits in other lanes set to zero. 
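// This holds for the SVE predicated reductions matched below, whose scalar
// result is written to lane 0 with the remaining lanes zeroed.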
21209 static bool isLanes1toNKnownZero(SDValue Op) { 21210 switch (Op.getOpcode()) { 21211 default: 21212 return false; 21213 case AArch64ISD::ANDV_PRED: 21214 case AArch64ISD::EORV_PRED: 21215 case AArch64ISD::FADDA_PRED: 21216 case AArch64ISD::FADDV_PRED: 21217 case AArch64ISD::FMAXNMV_PRED: 21218 case AArch64ISD::FMAXV_PRED: 21219 case AArch64ISD::FMINNMV_PRED: 21220 case AArch64ISD::FMINV_PRED: 21221 case AArch64ISD::ORV_PRED: 21222 case AArch64ISD::SADDV_PRED: 21223 case AArch64ISD::SMAXV_PRED: 21224 case AArch64ISD::SMINV_PRED: 21225 case AArch64ISD::UADDV_PRED: 21226 case AArch64ISD::UMAXV_PRED: 21227 case AArch64ISD::UMINV_PRED: 21228 return true; 21229 } 21230 } 21231 21232 static SDValue removeRedundantInsertVectorElt(SDNode *N) { 21233 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); 21234 SDValue InsertVec = N->getOperand(0); 21235 SDValue InsertElt = N->getOperand(1); 21236 SDValue InsertIdx = N->getOperand(2); 21237 21238 // We only care about inserts into the first element... 21239 if (!isNullConstant(InsertIdx)) 21240 return SDValue(); 21241 // ...of a zero'd vector... 21242 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode())) 21243 return SDValue(); 21244 // ...where the inserted data was previously extracted... 21245 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 21246 return SDValue(); 21247 21248 SDValue ExtractVec = InsertElt.getOperand(0); 21249 SDValue ExtractIdx = InsertElt.getOperand(1); 21250 21251 // ...from the first element of a vector. 21252 if (!isNullConstant(ExtractIdx)) 21253 return SDValue(); 21254 21255 // If we get here we are effectively trying to zero lanes 1-N of a vector. 21256 21257 // Ensure there's no type conversion going on. 21258 if (N->getValueType(0) != ExtractVec.getValueType()) 21259 return SDValue(); 21260 21261 if (!isLanes1toNKnownZero(ExtractVec)) 21262 return SDValue(); 21263 21264 // The explicit zeroing is redundant. 21265 return ExtractVec; 21266 } 21267 21268 static SDValue 21269 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 21270 if (SDValue Res = removeRedundantInsertVectorElt(N)) 21271 return Res; 21272 21273 return performPostLD1Combine(N, DCI, true); 21274 } 21275 21276 static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { 21277 EVT Ty = N->getValueType(0); 21278 if (Ty.isInteger()) 21279 return SDValue(); 21280 21281 EVT IntTy = Ty.changeVectorElementTypeToInteger(); 21282 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount()); 21283 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() < 21284 IntTy.getVectorElementType().getScalarSizeInBits()) 21285 return SDValue(); 21286 21287 SDLoc DL(N); 21288 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)), 21289 DL, ExtIntTy); 21290 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)), 21291 DL, ExtIntTy); 21292 SDValue Idx = N->getOperand(2); 21293 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx); 21294 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy); 21295 return DAG.getBitcast(Ty, Trunc); 21296 } 21297 21298 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, 21299 TargetLowering::DAGCombinerInfo &DCI, 21300 const AArch64Subtarget *Subtarget) { 21301 SDValue N0 = N->getOperand(0); 21302 EVT VT = N->getValueType(0); 21303 21304 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. 
21305 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) 21306 return SDValue(); 21307 21308 auto hasValidElementTypeForFPExtLoad = [](EVT VT) { 21309 EVT EltVT = VT.getVectorElementType(); 21310 return EltVT == MVT::f32 || EltVT == MVT::f64; 21311 }; 21312 21313 // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) 21314 // We purposefully don't care about legality of the nodes here as we know 21315 // they can be split down into something legal. 21316 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && 21317 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && 21318 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) && 21319 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) { 21320 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 21321 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, 21322 LN0->getChain(), LN0->getBasePtr(), 21323 N0.getValueType(), LN0->getMemOperand()); 21324 DCI.CombineTo(N, ExtLoad); 21325 DCI.CombineTo( 21326 N0.getNode(), 21327 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad, 21328 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)), 21329 ExtLoad.getValue(1)); 21330 return SDValue(N, 0); // Return N so it doesn't get rechecked! 21331 } 21332 21333 return SDValue(); 21334 } 21335 21336 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, 21337 const AArch64Subtarget *Subtarget) { 21338 EVT VT = N->getValueType(0); 21339 21340 // Don't expand for NEON, SVE2 or SME 21341 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME()) 21342 return SDValue(); 21343 21344 SDLoc DL(N); 21345 21346 SDValue Mask = N->getOperand(0); 21347 SDValue In1 = N->getOperand(1); 21348 SDValue In2 = N->getOperand(2); 21349 21350 SDValue InvMask = DAG.getNOT(DL, Mask, VT); 21351 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1); 21352 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2); 21353 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv); 21354 } 21355 21356 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) { 21357 EVT VT = N->getValueType(0); 21358 21359 SDValue Insert = N->getOperand(0); 21360 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR) 21361 return SDValue(); 21362 21363 if (!Insert.getOperand(0).isUndef()) 21364 return SDValue(); 21365 21366 uint64_t IdxInsert = Insert.getConstantOperandVal(2); 21367 uint64_t IdxDupLane = N->getConstantOperandVal(1); 21368 if (IdxInsert != 0 || IdxDupLane != 0) 21369 return SDValue(); 21370 21371 SDValue Bitcast = Insert.getOperand(1); 21372 if (Bitcast.getOpcode() != ISD::BITCAST) 21373 return SDValue(); 21374 21375 SDValue Subvec = Bitcast.getOperand(0); 21376 EVT SubvecVT = Subvec.getValueType(); 21377 if (!SubvecVT.is128BitVector()) 21378 return SDValue(); 21379 EVT NewSubvecVT = 21380 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType()); 21381 21382 SDLoc DL(N); 21383 SDValue NewInsert = 21384 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT, 21385 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2)); 21386 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT, 21387 NewInsert, N->getOperand(1)); 21388 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128); 21389 } 21390 21391 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 21392 DAGCombinerInfo &DCI) const { 21393 SelectionDAG &DAG = DCI.DAG; 21394 switch (N->getOpcode()) { 21395 default: 21396 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); 21397 break; 21398 case 
ISD::ADD: 21399 case ISD::SUB: 21400 return performAddSubCombine(N, DCI, DAG); 21401 case ISD::BUILD_VECTOR: 21402 return performBuildVectorCombine(N, DCI, DAG); 21403 case ISD::TRUNCATE: 21404 return performTruncateCombine(N, DAG); 21405 case AArch64ISD::ANDS: 21406 return performFlagSettingCombine(N, DCI, ISD::AND); 21407 case AArch64ISD::ADC: 21408 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) 21409 return R; 21410 return foldADCToCINC(N, DAG); 21411 case AArch64ISD::SBC: 21412 return foldOverflowCheck(N, DAG, /* IsAdd */ false); 21413 case AArch64ISD::ADCS: 21414 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) 21415 return R; 21416 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC); 21417 case AArch64ISD::SBCS: 21418 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false)) 21419 return R; 21420 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC); 21421 case ISD::XOR: 21422 return performXorCombine(N, DAG, DCI, Subtarget); 21423 case ISD::MUL: 21424 return performMulCombine(N, DAG, DCI, Subtarget); 21425 case ISD::SINT_TO_FP: 21426 case ISD::UINT_TO_FP: 21427 return performIntToFpCombine(N, DAG, Subtarget); 21428 case ISD::FP_TO_SINT: 21429 case ISD::FP_TO_UINT: 21430 case ISD::FP_TO_SINT_SAT: 21431 case ISD::FP_TO_UINT_SAT: 21432 return performFpToIntCombine(N, DAG, DCI, Subtarget); 21433 case ISD::FDIV: 21434 return performFDivCombine(N, DAG, DCI, Subtarget); 21435 case ISD::OR: 21436 return performORCombine(N, DCI, Subtarget, *this); 21437 case ISD::AND: 21438 return performANDCombine(N, DCI); 21439 case ISD::INTRINSIC_WO_CHAIN: 21440 return performIntrinsicCombine(N, DCI, Subtarget); 21441 case ISD::ANY_EXTEND: 21442 case ISD::ZERO_EXTEND: 21443 case ISD::SIGN_EXTEND: 21444 return performExtendCombine(N, DCI, DAG); 21445 case ISD::SIGN_EXTEND_INREG: 21446 return performSignExtendInRegCombine(N, DCI, DAG); 21447 case ISD::CONCAT_VECTORS: 21448 return performConcatVectorsCombine(N, DCI, DAG); 21449 case ISD::EXTRACT_SUBVECTOR: 21450 return performExtractSubvectorCombine(N, DCI, DAG); 21451 case ISD::INSERT_SUBVECTOR: 21452 return performInsertSubvectorCombine(N, DCI, DAG); 21453 case ISD::SELECT: 21454 return performSelectCombine(N, DCI); 21455 case ISD::VSELECT: 21456 return performVSelectCombine(N, DCI.DAG); 21457 case ISD::SETCC: 21458 return performSETCCCombine(N, DCI, DAG); 21459 case ISD::LOAD: 21460 return performLOADCombine(N, DCI, DAG, Subtarget); 21461 case ISD::STORE: 21462 return performSTORECombine(N, DCI, DAG, Subtarget); 21463 case ISD::MSTORE: 21464 return performMSTORECombine(N, DCI, DAG, Subtarget); 21465 case ISD::MGATHER: 21466 case ISD::MSCATTER: 21467 return performMaskedGatherScatterCombine(N, DCI, DAG); 21468 case ISD::VECTOR_SPLICE: 21469 return performSVESpliceCombine(N, DAG); 21470 case ISD::FP_EXTEND: 21471 return performFPExtendCombine(N, DAG, DCI, Subtarget); 21472 case AArch64ISD::BRCOND: 21473 return performBRCONDCombine(N, DCI, DAG); 21474 case AArch64ISD::TBNZ: 21475 case AArch64ISD::TBZ: 21476 return performTBZCombine(N, DCI, DAG); 21477 case AArch64ISD::CSEL: 21478 return performCSELCombine(N, DCI, DAG); 21479 case AArch64ISD::DUP: 21480 return performDUPCombine(N, DCI); 21481 case AArch64ISD::DUPLANE128: 21482 return performDupLane128Combine(N, DAG); 21483 case AArch64ISD::NVCAST: 21484 return performNVCASTCombine(N); 21485 case AArch64ISD::SPLICE: 21486 return performSpliceCombine(N, DAG); 21487 case AArch64ISD::UUNPKLO: 21488 case AArch64ISD::UUNPKHI: 21489 return performUnpackCombine(N, DAG, Subtarget); 21490 
case AArch64ISD::UZP1: 21491 return performUzpCombine(N, DAG); 21492 case AArch64ISD::SETCC_MERGE_ZERO: 21493 return performSetccMergeZeroCombine(N, DCI); 21494 case AArch64ISD::REINTERPRET_CAST: 21495 return performReinterpretCastCombine(N); 21496 case AArch64ISD::GLD1_MERGE_ZERO: 21497 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 21498 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 21499 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 21500 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 21501 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 21502 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 21503 case AArch64ISD::GLD1S_MERGE_ZERO: 21504 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO: 21505 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO: 21506 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO: 21507 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO: 21508 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO: 21509 case AArch64ISD::GLD1S_IMM_MERGE_ZERO: 21510 return performGLD1Combine(N, DAG); 21511 case AArch64ISD::VASHR: 21512 case AArch64ISD::VLSHR: 21513 return performVectorShiftCombine(N, *this, DCI); 21514 case AArch64ISD::SUNPKLO: 21515 return performSunpkloCombine(N, DAG); 21516 case AArch64ISD::BSP: 21517 return performBSPExpandForSVE(N, DAG, Subtarget); 21518 case ISD::INSERT_VECTOR_ELT: 21519 return performInsertVectorEltCombine(N, DCI); 21520 case ISD::EXTRACT_VECTOR_ELT: 21521 return performExtractVectorEltCombine(N, DCI, Subtarget); 21522 case ISD::VECREDUCE_ADD: 21523 return performVecReduceAddCombine(N, DCI.DAG, Subtarget); 21524 case AArch64ISD::UADDV: 21525 return performUADDVCombine(N, DAG); 21526 case AArch64ISD::SMULL: 21527 case AArch64ISD::UMULL: 21528 case AArch64ISD::PMULL: 21529 return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG); 21530 case ISD::INTRINSIC_VOID: 21531 case ISD::INTRINSIC_W_CHAIN: 21532 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 21533 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: 21534 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); 21535 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: 21536 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); 21537 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: 21538 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); 21539 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: 21540 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); 21541 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: 21542 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: 21543 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: 21544 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: 21545 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: 21546 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: 21547 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: 21548 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: 21549 return legalizeSVEGatherPrefetchOffsVec(N, DAG); 21550 case Intrinsic::aarch64_neon_ld2: 21551 case Intrinsic::aarch64_neon_ld3: 21552 case Intrinsic::aarch64_neon_ld4: 21553 case Intrinsic::aarch64_neon_ld1x2: 21554 case Intrinsic::aarch64_neon_ld1x3: 21555 case Intrinsic::aarch64_neon_ld1x4: 21556 case Intrinsic::aarch64_neon_ld2lane: 21557 case Intrinsic::aarch64_neon_ld3lane: 21558 case Intrinsic::aarch64_neon_ld4lane: 21559 case Intrinsic::aarch64_neon_ld2r: 21560 case Intrinsic::aarch64_neon_ld3r: 21561 case Intrinsic::aarch64_neon_ld4r: 21562 case Intrinsic::aarch64_neon_st2: 21563 case Intrinsic::aarch64_neon_st3: 21564 case 
Intrinsic::aarch64_neon_st4: 21565 case Intrinsic::aarch64_neon_st1x2: 21566 case Intrinsic::aarch64_neon_st1x3: 21567 case Intrinsic::aarch64_neon_st1x4: 21568 case Intrinsic::aarch64_neon_st2lane: 21569 case Intrinsic::aarch64_neon_st3lane: 21570 case Intrinsic::aarch64_neon_st4lane: 21571 return performNEONPostLDSTCombine(N, DCI, DAG); 21572 case Intrinsic::aarch64_sve_ldnt1: 21573 return performLDNT1Combine(N, DAG); 21574 case Intrinsic::aarch64_sve_ld1rq: 21575 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); 21576 case Intrinsic::aarch64_sve_ld1ro: 21577 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); 21578 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 21579 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 21580 case Intrinsic::aarch64_sve_ldnt1_gather: 21581 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 21582 case Intrinsic::aarch64_sve_ldnt1_gather_index: 21583 return performGatherLoadCombine(N, DAG, 21584 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); 21585 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 21586 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 21587 case Intrinsic::aarch64_sve_ld1: 21588 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); 21589 case Intrinsic::aarch64_sve_ldnf1: 21590 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); 21591 case Intrinsic::aarch64_sve_ldff1: 21592 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); 21593 case Intrinsic::aarch64_sve_st1: 21594 return performST1Combine(N, DAG); 21595 case Intrinsic::aarch64_sve_stnt1: 21596 return performSTNT1Combine(N, DAG); 21597 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 21598 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 21599 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 21600 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 21601 case Intrinsic::aarch64_sve_stnt1_scatter: 21602 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 21603 case Intrinsic::aarch64_sve_stnt1_scatter_index: 21604 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); 21605 case Intrinsic::aarch64_sve_ld1_gather: 21606 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); 21607 case Intrinsic::aarch64_sve_ld1_gather_index: 21608 return performGatherLoadCombine(N, DAG, 21609 AArch64ISD::GLD1_SCALED_MERGE_ZERO); 21610 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 21611 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, 21612 /*OnlyPackedOffsets=*/false); 21613 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 21614 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, 21615 /*OnlyPackedOffsets=*/false); 21616 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 21617 return performGatherLoadCombine(N, DAG, 21618 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, 21619 /*OnlyPackedOffsets=*/false); 21620 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 21621 return performGatherLoadCombine(N, DAG, 21622 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, 21623 /*OnlyPackedOffsets=*/false); 21624 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 21625 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); 21626 case Intrinsic::aarch64_sve_ldff1_gather: 21627 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); 21628 case Intrinsic::aarch64_sve_ldff1_gather_index: 21629 return performGatherLoadCombine(N, DAG, 
21630 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); 21631 case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 21632 return performGatherLoadCombine(N, DAG, 21633 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, 21634 /*OnlyPackedOffsets=*/false); 21635 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 21636 return performGatherLoadCombine(N, DAG, 21637 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, 21638 /*OnlyPackedOffsets=*/false); 21639 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 21640 return performGatherLoadCombine(N, DAG, 21641 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, 21642 /*OnlyPackedOffsets=*/false); 21643 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 21644 return performGatherLoadCombine(N, DAG, 21645 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, 21646 /*OnlyPackedOffsets=*/false); 21647 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 21648 return performGatherLoadCombine(N, DAG, 21649 AArch64ISD::GLDFF1_IMM_MERGE_ZERO); 21650 case Intrinsic::aarch64_sve_st1_scatter: 21651 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); 21652 case Intrinsic::aarch64_sve_st1_scatter_index: 21653 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); 21654 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 21655 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, 21656 /*OnlyPackedOffsets=*/false); 21657 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 21658 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, 21659 /*OnlyPackedOffsets=*/false); 21660 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 21661 return performScatterStoreCombine(N, DAG, 21662 AArch64ISD::SST1_SXTW_SCALED_PRED, 21663 /*OnlyPackedOffsets=*/false); 21664 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 21665 return performScatterStoreCombine(N, DAG, 21666 AArch64ISD::SST1_UXTW_SCALED_PRED, 21667 /*OnlyPackedOffsets=*/false); 21668 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 21669 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); 21670 case Intrinsic::aarch64_rndr: 21671 case Intrinsic::aarch64_rndrrs: { 21672 unsigned IntrinsicID = 21673 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 21674 auto Register = 21675 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR 21676 : AArch64SysReg::RNDRRS); 21677 SDLoc DL(N); 21678 SDValue A = DAG.getNode( 21679 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), 21680 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64)); 21681 SDValue B = DAG.getNode( 21682 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), 21683 DAG.getConstant(0, DL, MVT::i32), 21684 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); 21685 return DAG.getMergeValues( 21686 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); 21687 } 21688 default: 21689 break; 21690 } 21691 break; 21692 case ISD::GlobalAddress: 21693 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); 21694 case ISD::CTLZ: 21695 return performCTLZCombine(N, DAG, Subtarget); 21696 } 21697 return SDValue(); 21698 } 21699 21700 // Check if the return value is used as only a return value, as otherwise 21701 // we can't perform a tail-call. In particular, we need to check for 21702 // target ISD nodes that are returns and any other "odd" constructs 21703 // that the generic analysis code won't necessarily catch. 
21704 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
21705 SDValue &Chain) const {
21706 if (N->getNumValues() != 1)
21707 return false;
21708 if (!N->hasNUsesOfValue(1, 0))
21709 return false;
21710
21711 SDValue TCChain = Chain;
21712 SDNode *Copy = *N->use_begin();
21713 if (Copy->getOpcode() == ISD::CopyToReg) {
21714 // If the copy has a glue operand, we conservatively assume it isn't safe to
21715 // perform a tail call.
21716 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
21717 MVT::Glue)
21718 return false;
21719 TCChain = Copy->getOperand(0);
21720 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
21721 return false;
21722
21723 bool HasRet = false;
21724 for (SDNode *Node : Copy->uses()) {
21725 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
21726 return false;
21727 HasRet = true;
21728 }
21729
21730 if (!HasRet)
21731 return false;
21732
21733 Chain = TCChain;
21734 return true;
21735 }
21736
21737 // Return whether an instruction can potentially be optimized to a tail
21738 // call. This will cause the optimizers to attempt to move, or duplicate,
21739 // return instructions to help enable tail call optimizations for this
21740 // instruction.
21741 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
21742 return CI->isTailCall();
21743 }
21744
21745 bool AArch64TargetLowering::getIndexedAddressParts(
21746 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
21747 ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const {
21748 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
21749 return false;
21750
21751 // Non-null if there is exactly one user of the loaded value (ignoring chain).
21752 SDNode *ValOnlyUser = nullptr;
21753 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
21754 ++UI) {
21755 if (UI.getUse().getResNo() == 1)
21756 continue; // Ignore chain.
21757 if (ValOnlyUser == nullptr)
21758 ValOnlyUser = *UI;
21759 else {
21760 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
21761 break;
21762 }
21763 }
21764
21765 auto IsUndefOrZero = [](SDValue V) {
21766 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
21767 };
21768
21769 // If the only user of the value is a scalable vector splat, it is
21770 // preferable to do a replicating load (ld1r*).
21771 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
21772 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
21773 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
21774 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
21775 return false;
21776
21777 Base = Op->getOperand(0);
21778 // All of the indexed addressing mode instructions take a signed
21779 // 9 bit immediate offset.
21780 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 21781 int64_t RHSC = RHS->getSExtValue(); 21782 if (Op->getOpcode() == ISD::SUB) 21783 RHSC = -(uint64_t)RHSC; 21784 if (!isInt<9>(RHSC)) 21785 return false; 21786 IsInc = (Op->getOpcode() == ISD::ADD); 21787 Offset = Op->getOperand(1); 21788 return true; 21789 } 21790 return false; 21791 } 21792 21793 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 21794 SDValue &Offset, 21795 ISD::MemIndexedMode &AM, 21796 SelectionDAG &DAG) const { 21797 EVT VT; 21798 SDValue Ptr; 21799 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 21800 VT = LD->getMemoryVT(); 21801 Ptr = LD->getBasePtr(); 21802 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 21803 VT = ST->getMemoryVT(); 21804 Ptr = ST->getBasePtr(); 21805 } else 21806 return false; 21807 21808 bool IsInc; 21809 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 21810 return false; 21811 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; 21812 return true; 21813 } 21814 21815 bool AArch64TargetLowering::getPostIndexedAddressParts( 21816 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 21817 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 21818 EVT VT; 21819 SDValue Ptr; 21820 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 21821 VT = LD->getMemoryVT(); 21822 Ptr = LD->getBasePtr(); 21823 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 21824 VT = ST->getMemoryVT(); 21825 Ptr = ST->getBasePtr(); 21826 } else 21827 return false; 21828 21829 bool IsInc; 21830 if (!getIndexedAddressParts(N, Op, Base, Offset, AM, IsInc, DAG)) 21831 return false; 21832 // Post-indexing updates the base, so it's not a valid transform 21833 // if that's not the same as the load's pointer. 21834 if (Ptr != Base) 21835 return false; 21836 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; 21837 return true; 21838 } 21839 21840 void AArch64TargetLowering::ReplaceBITCASTResults( 21841 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 21842 SDLoc DL(N); 21843 SDValue Op = N->getOperand(0); 21844 EVT VT = N->getValueType(0); 21845 EVT SrcVT = Op.getValueType(); 21846 21847 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) { 21848 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && 21849 "Expected fp->int bitcast!"); 21850 21851 // Bitcasting between unpacked vector types of different element counts is 21852 // not a NOP because the live elements are laid out differently. 21853 // 01234567 21854 // e.g. nxv2i32 = XX??XX?? 21855 // nxv4f16 = X?X?X?X? 
21856 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount()) 21857 return; 21858 21859 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG); 21860 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult)); 21861 return; 21862 } 21863 21864 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) 21865 return; 21866 21867 Op = SDValue( 21868 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 21869 DAG.getUNDEF(MVT::i32), Op, 21870 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 21871 0); 21872 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 21873 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 21874 } 21875 21876 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results, 21877 SelectionDAG &DAG, 21878 const AArch64Subtarget *Subtarget) { 21879 EVT VT = N->getValueType(0); 21880 if (!VT.is256BitVector() || 21881 (VT.getScalarType().isFloatingPoint() && 21882 !N->getFlags().hasAllowReassociation()) || 21883 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16())) 21884 return; 21885 21886 SDValue X = N->getOperand(0); 21887 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1)); 21888 if (!Shuf) { 21889 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0)); 21890 X = N->getOperand(1); 21891 if (!Shuf) 21892 return; 21893 } 21894 21895 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef()) 21896 return; 21897 21898 // Check the mask is 1,0,3,2,5,4,... 21899 ArrayRef<int> Mask = Shuf->getMask(); 21900 for (int I = 0, E = Mask.size(); I < E; I++) 21901 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1)) 21902 return; 21903 21904 SDLoc DL(N); 21905 auto LoHi = DAG.SplitVector(X, DL); 21906 assert(LoHi.first.getValueType() == LoHi.second.getValueType()); 21907 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(), 21908 LoHi.first, LoHi.second); 21909 21910 // Shuffle the elements back into order. 21911 SmallVector<int> NMask; 21912 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) { 21913 NMask.push_back(I); 21914 NMask.push_back(I); 21915 } 21916 Results.push_back( 21917 DAG.getVectorShuffle(VT, DL, 21918 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp, 21919 DAG.getUNDEF(LoHi.first.getValueType())), 21920 DAG.getUNDEF(VT), NMask)); 21921 } 21922 21923 static void ReplaceReductionResults(SDNode *N, 21924 SmallVectorImpl<SDValue> &Results, 21925 SelectionDAG &DAG, unsigned InterOp, 21926 unsigned AcrossOp) { 21927 EVT LoVT, HiVT; 21928 SDValue Lo, Hi; 21929 SDLoc dl(N); 21930 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 21931 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 21932 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 21933 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 21934 Results.push_back(SplitVal); 21935 } 21936 21937 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { 21938 SDLoc DL(N); 21939 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); 21940 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, 21941 DAG.getNode(ISD::SRL, DL, MVT::i128, N, 21942 DAG.getConstant(64, DL, MVT::i64))); 21943 return std::make_pair(Lo, Hi); 21944 } 21945 21946 void AArch64TargetLowering::ReplaceExtractSubVectorResults( 21947 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 21948 SDValue In = N->getOperand(0); 21949 EVT InVT = In.getValueType(); 21950 21951 // Common code will handle these just fine. 
21952 if (!InVT.isScalableVector() || !InVT.isInteger()) 21953 return; 21954 21955 SDLoc DL(N); 21956 EVT VT = N->getValueType(0); 21957 21958 // The following checks bail if this is not a halving operation. 21959 21960 ElementCount ResEC = VT.getVectorElementCount(); 21961 21962 if (InVT.getVectorElementCount() != (ResEC * 2)) 21963 return; 21964 21965 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); 21966 if (!CIndex) 21967 return; 21968 21969 unsigned Index = CIndex->getZExtValue(); 21970 if ((Index != 0) && (Index != ResEC.getKnownMinValue())) 21971 return; 21972 21973 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; 21974 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); 21975 21976 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); 21977 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); 21978 } 21979 21980 // Create an even/odd pair of X registers holding integer value V. 21981 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 21982 SDLoc dl(V.getNode()); 21983 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); 21984 SDValue VHi = DAG.getAnyExtOrTrunc( 21985 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), 21986 dl, MVT::i64); 21987 if (DAG.getDataLayout().isBigEndian()) 21988 std::swap (VLo, VHi); 21989 SDValue RegClass = 21990 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); 21991 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); 21992 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); 21993 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 21994 return SDValue( 21995 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 21996 } 21997 21998 static void ReplaceCMP_SWAP_128Results(SDNode *N, 21999 SmallVectorImpl<SDValue> &Results, 22000 SelectionDAG &DAG, 22001 const AArch64Subtarget *Subtarget) { 22002 assert(N->getValueType(0) == MVT::i128 && 22003 "AtomicCmpSwap on types less than 128 should be legal"); 22004 22005 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 22006 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { 22007 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, 22008 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. 
22009 SDValue Ops[] = { 22010 createGPRPairNode(DAG, N->getOperand(2)), // Compare value 22011 createGPRPairNode(DAG, N->getOperand(3)), // Store value 22012 N->getOperand(1), // Ptr 22013 N->getOperand(0), // Chain in 22014 }; 22015 22016 unsigned Opcode; 22017 switch (MemOp->getMergedOrdering()) { 22018 case AtomicOrdering::Monotonic: 22019 Opcode = AArch64::CASPX; 22020 break; 22021 case AtomicOrdering::Acquire: 22022 Opcode = AArch64::CASPAX; 22023 break; 22024 case AtomicOrdering::Release: 22025 Opcode = AArch64::CASPLX; 22026 break; 22027 case AtomicOrdering::AcquireRelease: 22028 case AtomicOrdering::SequentiallyConsistent: 22029 Opcode = AArch64::CASPALX; 22030 break; 22031 default: 22032 llvm_unreachable("Unexpected ordering!"); 22033 } 22034 22035 MachineSDNode *CmpSwap = DAG.getMachineNode( 22036 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); 22037 DAG.setNodeMemRefs(CmpSwap, {MemOp}); 22038 22039 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; 22040 if (DAG.getDataLayout().isBigEndian()) 22041 std::swap(SubReg1, SubReg2); 22042 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, 22043 SDValue(CmpSwap, 0)); 22044 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, 22045 SDValue(CmpSwap, 0)); 22046 Results.push_back( 22047 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); 22048 Results.push_back(SDValue(CmpSwap, 1)); // Chain out 22049 return; 22050 } 22051 22052 unsigned Opcode; 22053 switch (MemOp->getMergedOrdering()) { 22054 case AtomicOrdering::Monotonic: 22055 Opcode = AArch64::CMP_SWAP_128_MONOTONIC; 22056 break; 22057 case AtomicOrdering::Acquire: 22058 Opcode = AArch64::CMP_SWAP_128_ACQUIRE; 22059 break; 22060 case AtomicOrdering::Release: 22061 Opcode = AArch64::CMP_SWAP_128_RELEASE; 22062 break; 22063 case AtomicOrdering::AcquireRelease: 22064 case AtomicOrdering::SequentiallyConsistent: 22065 Opcode = AArch64::CMP_SWAP_128; 22066 break; 22067 default: 22068 llvm_unreachable("Unexpected ordering!"); 22069 } 22070 22071 auto Desired = splitInt128(N->getOperand(2), DAG); 22072 auto New = splitInt128(N->getOperand(3), DAG); 22073 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, 22074 New.first, New.second, N->getOperand(0)}; 22075 SDNode *CmpSwap = DAG.getMachineNode( 22076 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), 22077 Ops); 22078 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 22079 22080 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 22081 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); 22082 Results.push_back(SDValue(CmpSwap, 3)); 22083 } 22084 22085 void AArch64TargetLowering::ReplaceNodeResults( 22086 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 22087 switch (N->getOpcode()) { 22088 default: 22089 llvm_unreachable("Don't know how to custom expand this"); 22090 case ISD::BITCAST: 22091 ReplaceBITCASTResults(N, Results, DAG); 22092 return; 22093 case ISD::VECREDUCE_ADD: 22094 case ISD::VECREDUCE_SMAX: 22095 case ISD::VECREDUCE_SMIN: 22096 case ISD::VECREDUCE_UMAX: 22097 case ISD::VECREDUCE_UMIN: 22098 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); 22099 return; 22100 case ISD::ADD: 22101 case ISD::FADD: 22102 ReplaceAddWithADDP(N, Results, DAG, Subtarget); 22103 return; 22104 22105 case ISD::CTPOP: 22106 case ISD::PARITY: 22107 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG)) 22108 Results.push_back(Result); 22109 return; 22110 case AArch64ISD::SADDV: 22111 
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 22112 return; 22113 case AArch64ISD::UADDV: 22114 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 22115 return; 22116 case AArch64ISD::SMINV: 22117 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); 22118 return; 22119 case AArch64ISD::UMINV: 22120 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 22121 return; 22122 case AArch64ISD::SMAXV: 22123 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 22124 return; 22125 case AArch64ISD::UMAXV: 22126 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 22127 return; 22128 case ISD::FP_TO_UINT: 22129 case ISD::FP_TO_SINT: 22130 case ISD::STRICT_FP_TO_SINT: 22131 case ISD::STRICT_FP_TO_UINT: 22132 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 22133 // Let normal code take care of it by not adding anything to Results. 22134 return; 22135 case ISD::ATOMIC_CMP_SWAP: 22136 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); 22137 return; 22138 case ISD::ATOMIC_LOAD: 22139 case ISD::LOAD: { 22140 MemSDNode *LoadNode = cast<MemSDNode>(N); 22141 EVT MemVT = LoadNode->getMemoryVT(); 22142 // Handle lowering 256 bit non temporal loads into LDNP for little-endian 22143 // targets. 22144 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() && 22145 MemVT.getSizeInBits() == 256u && 22146 (MemVT.getScalarSizeInBits() == 8u || 22147 MemVT.getScalarSizeInBits() == 16u || 22148 MemVT.getScalarSizeInBits() == 32u || 22149 MemVT.getScalarSizeInBits() == 64u)) { 22150 22151 SDValue Result = DAG.getMemIntrinsicNode( 22152 AArch64ISD::LDNP, SDLoc(N), 22153 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 22154 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 22155 MVT::Other}), 22156 {LoadNode->getChain(), LoadNode->getBasePtr()}, 22157 LoadNode->getMemoryVT(), LoadNode->getMemOperand()); 22158 22159 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT, 22160 Result.getValue(0), Result.getValue(1)); 22161 Results.append({Pair, Result.getValue(2) /* Chain */}); 22162 return; 22163 } 22164 22165 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) || 22166 LoadNode->getMemoryVT() != MVT::i128) { 22167 // Non-volatile or atomic loads are optimized later in AArch64's load/store 22168 // optimizer. 
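      // Only volatile or atomic i128 loads fall through below, where they are
      // lowered to an AArch64ISD::LDP of two i64 halves that are re-paired
      // into an i128.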
22169 return; 22170 } 22171 22172 if (SDValue(N, 0).getValueType() == MVT::i128) { 22173 SDValue Result = DAG.getMemIntrinsicNode( 22174 AArch64ISD::LDP, SDLoc(N), 22175 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), 22176 {LoadNode->getChain(), LoadNode->getBasePtr()}, 22177 LoadNode->getMemoryVT(), LoadNode->getMemOperand()); 22178 22179 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 22180 Result.getValue(0), Result.getValue(1)); 22181 Results.append({Pair, Result.getValue(2) /* Chain */}); 22182 } 22183 return; 22184 } 22185 case ISD::EXTRACT_SUBVECTOR: 22186 ReplaceExtractSubVectorResults(N, Results, DAG); 22187 return; 22188 case ISD::INSERT_SUBVECTOR: 22189 case ISD::CONCAT_VECTORS: 22190 // Custom lowering has been requested for INSERT_SUBVECTOR and 22191 // CONCAT_VECTORS -- but delegate to common code for result type 22192 // legalisation 22193 return; 22194 case ISD::INTRINSIC_WO_CHAIN: { 22195 EVT VT = N->getValueType(0); 22196 assert((VT == MVT::i8 || VT == MVT::i16) && 22197 "custom lowering for unexpected type"); 22198 22199 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0)); 22200 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 22201 switch (IntID) { 22202 default: 22203 return; 22204 case Intrinsic::aarch64_sve_clasta_n: { 22205 SDLoc DL(N); 22206 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 22207 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, 22208 N->getOperand(1), Op2, N->getOperand(3)); 22209 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 22210 return; 22211 } 22212 case Intrinsic::aarch64_sve_clastb_n: { 22213 SDLoc DL(N); 22214 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 22215 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, 22216 N->getOperand(1), Op2, N->getOperand(3)); 22217 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 22218 return; 22219 } 22220 case Intrinsic::aarch64_sve_lasta: { 22221 SDLoc DL(N); 22222 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, 22223 N->getOperand(1), N->getOperand(2)); 22224 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 22225 return; 22226 } 22227 case Intrinsic::aarch64_sve_lastb: { 22228 SDLoc DL(N); 22229 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, 22230 N->getOperand(1), N->getOperand(2)); 22231 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 22232 return; 22233 } 22234 } 22235 } 22236 case ISD::READ_REGISTER: { 22237 SDLoc DL(N); 22238 assert(N->getValueType(0) == MVT::i128 && 22239 "READ_REGISTER custom lowering is only for 128-bit sysregs"); 22240 SDValue Chain = N->getOperand(0); 22241 SDValue SysRegName = N->getOperand(1); 22242 22243 SDValue Result = DAG.getNode( 22244 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), 22245 Chain, SysRegName); 22246 22247 // Sysregs are not endian. Result.getValue(0) always contains the lower half 22248 // of the 128-bit System Register value. 
22249 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, 22250 Result.getValue(0), Result.getValue(1)); 22251 Results.push_back(Pair); 22252 Results.push_back(Result.getValue(2)); // Chain 22253 return; 22254 } 22255 } 22256 } 22257 22258 bool AArch64TargetLowering::useLoadStackGuardNode() const { 22259 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) 22260 return TargetLowering::useLoadStackGuardNode(); 22261 return true; 22262 } 22263 22264 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 22265 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 22266 // reciprocal if there are three or more FDIVs. 22267 return 3; 22268 } 22269 22270 TargetLoweringBase::LegalizeTypeAction 22271 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { 22272 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 22273 // v4i16, v2i32 instead of to promote. 22274 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || 22275 VT == MVT::v1f32) 22276 return TypeWidenVector; 22277 22278 return TargetLoweringBase::getPreferredVectorAction(VT); 22279 } 22280 22281 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic 22282 // provided the address is 16-byte aligned. 22283 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { 22284 if (!Subtarget->hasLSE2()) 22285 return false; 22286 22287 if (auto LI = dyn_cast<LoadInst>(I)) 22288 return LI->getType()->getPrimitiveSizeInBits() == 128 && 22289 LI->getAlign() >= Align(16); 22290 22291 if (auto SI = dyn_cast<StoreInst>(I)) 22292 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && 22293 SI->getAlign() >= Align(16); 22294 22295 return false; 22296 } 22297 22298 bool AArch64TargetLowering::shouldInsertFencesForAtomic( 22299 const Instruction *I) const { 22300 return isOpSuitableForLDPSTP(I); 22301 } 22302 22303 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore( 22304 const Instruction *I) const { 22305 // Store-Release instructions only provide seq_cst guarantees when paired with 22306 // Load-Acquire instructions. MSVC CRT does not use these instructions to 22307 // implement seq_cst loads and stores, so we need additional explicit fences 22308 // after memory writes. 22309 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 22310 return false; 22311 22312 switch (I->getOpcode()) { 22313 default: 22314 return false; 22315 case Instruction::AtomicCmpXchg: 22316 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() == 22317 AtomicOrdering::SequentiallyConsistent; 22318 case Instruction::AtomicRMW: 22319 return cast<AtomicRMWInst>(I)->getOrdering() == 22320 AtomicOrdering::SequentiallyConsistent; 22321 case Instruction::Store: 22322 return cast<StoreInst>(I)->getOrdering() == 22323 AtomicOrdering::SequentiallyConsistent; 22324 } 22325 } 22326 22327 // Loads and stores less than 128-bits are already atomic; ones above that 22328 // are doomed anyway, so defer to the default libcall and blame the OS when 22329 // things go wrong. 
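// For example, a 16-byte-aligned i128 store on an LSE2 target is left as a
// plain store (STP is single-copy atomic there), while other i128 stores are
// handed back to AtomicExpand via AtomicExpansionKind::Expand.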
22330 TargetLoweringBase::AtomicExpansionKind
22331 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22332 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
22333 if (Size != 128 || isOpSuitableForLDPSTP(SI))
22334 return AtomicExpansionKind::None;
22335 return AtomicExpansionKind::Expand;
22336 }
22337
22338 // Loads and stores less than 128-bits are already atomic; ones above that
22339 // are doomed anyway, so defer to the default libcall and blame the OS when
22340 // things go wrong.
22341 TargetLowering::AtomicExpansionKind
22342 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22343 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
22344
22345 if (Size != 128 || isOpSuitableForLDPSTP(LI))
22346 return AtomicExpansionKind::None;
22347
22348 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
22349 // implement atomicrmw without spilling. If the target address is also on the
22350 // stack and close enough to the spill slot, this can lead to a situation
22351 // where the monitor always gets cleared and the atomic operation can never
22352 // succeed. So at -O0 lower this operation to a CAS loop.
22353 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
22354 return AtomicExpansionKind::CmpXChg;
22355
22356 // Using CAS for an atomic load has a better chance of succeeding under high
22357 // contention situations. So use it if available.
22358 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
22359 : AtomicExpansionKind::LLSC;
22360 }
22361
22362 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
22363 TargetLowering::AtomicExpansionKind
22364 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22365 if (AI->isFloatingPointOperation())
22366 return AtomicExpansionKind::CmpXChg;
22367
22368 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
22369 if (Size > 128) return AtomicExpansionKind::None;
22370
22371 // Nand is not supported in LSE.
22372 // Leave 128 bits to LLSC or CmpXChg.
22373 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
22374 if (Subtarget->hasLSE())
22375 return AtomicExpansionKind::None;
22376 if (Subtarget->outlineAtomics()) {
22377 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
22378 // Don't outline them unless
22379 // (1) high level <atomic> support approved:
22380 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
22381 // (2) low level libgcc and compiler-rt support implemented by:
22382 // min/max outline atomics helpers
22383 if (AI->getOperation() != AtomicRMWInst::Min &&
22384 AI->getOperation() != AtomicRMWInst::Max &&
22385 AI->getOperation() != AtomicRMWInst::UMin &&
22386 AI->getOperation() != AtomicRMWInst::UMax) {
22387 return AtomicExpansionKind::None;
22388 }
22389 }
22390 }
22391
22392 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
22393 // implement atomicrmw without spilling. If the target address is also on the
22394 // stack and close enough to the spill slot, this can lead to a situation
22395 // where the monitor always gets cleared and the atomic operation can never
22396 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
22397 // we have a single CAS instruction that can replace the loop.
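  // For example, with +lse an atomicrmw nand (which has no LSE instruction)
  // is expanded to a loop built around CAS rather than LDXR/STXR.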
22398 if (getTargetMachine().getOptLevel() == CodeGenOpt::None || 22399 Subtarget->hasLSE()) 22400 return AtomicExpansionKind::CmpXChg; 22401 22402 return AtomicExpansionKind::LLSC; 22403 } 22404 22405 TargetLowering::AtomicExpansionKind 22406 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 22407 AtomicCmpXchgInst *AI) const { 22408 // If subtarget has LSE, leave cmpxchg intact for codegen. 22409 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) 22410 return AtomicExpansionKind::None; 22411 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 22412 // implement cmpxchg without spilling. If the address being exchanged is also 22413 // on the stack and close enough to the spill slot, this can lead to a 22414 // situation where the monitor always gets cleared and the atomic operation 22415 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 22416 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 22417 return AtomicExpansionKind::None; 22418 22419 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand 22420 // it. 22421 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); 22422 if (Size > 64) 22423 return AtomicExpansionKind::None; 22424 22425 return AtomicExpansionKind::LLSC; 22426 } 22427 22428 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, 22429 Type *ValueTy, Value *Addr, 22430 AtomicOrdering Ord) const { 22431 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 22432 bool IsAcquire = isAcquireOrStronger(Ord); 22433 22434 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 22435 // intrinsic must return {i64, i64} and we have to recombine them into a 22436 // single i128 here. 22437 if (ValueTy->getPrimitiveSizeInBits() == 128) { 22438 Intrinsic::ID Int = 22439 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 22440 Function *Ldxr = Intrinsic::getDeclaration(M, Int); 22441 22442 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 22443 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 22444 22445 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 22446 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 22447 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); 22448 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); 22449 return Builder.CreateOr( 22450 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); 22451 } 22452 22453 Type *Tys[] = { Addr->getType() }; 22454 Intrinsic::ID Int = 22455 IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 22456 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); 22457 22458 const DataLayout &DL = M->getDataLayout(); 22459 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); 22460 CallInst *CI = Builder.CreateCall(Ldxr, Addr); 22461 CI->addParamAttr( 22462 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); 22463 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); 22464 22465 return Builder.CreateBitCast(Trunc, ValueTy); 22466 } 22467 22468 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 22469 IRBuilderBase &Builder) const { 22470 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 22471 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 22472 } 22473 22474 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, 22475 Value *Val, Value *Addr, 22476 AtomicOrdering Ord) const { 22477 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 22478 bool IsRelease = isReleaseOrStronger(Ord); 22479 22480 // Since the intrinsics must have legal type, the i128 intrinsics take two 22481 // parameters: "i64, i64". We must marshal Val into the appropriate form 22482 // before the call. 22483 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 22484 Intrinsic::ID Int = 22485 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 22486 Function *Stxr = Intrinsic::getDeclaration(M, Int); 22487 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 22488 22489 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 22490 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 22491 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 22492 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 22493 } 22494 22495 Intrinsic::ID Int = 22496 IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 22497 Type *Tys[] = { Addr->getType() }; 22498 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 22499 22500 const DataLayout &DL = M->getDataLayout(); 22501 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); 22502 Val = Builder.CreateBitCast(Val, IntValTy); 22503 22504 CallInst *CI = Builder.CreateCall( 22505 Stxr, {Builder.CreateZExtOrBitCast( 22506 Val, Stxr->getFunctionType()->getParamType(0)), 22507 Addr}); 22508 CI->addParamAttr(1, Attribute::get(Builder.getContext(), 22509 Attribute::ElementType, Val->getType())); 22510 return CI; 22511 } 22512 22513 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 22514 Type *Ty, CallingConv::ID CallConv, bool isVarArg, 22515 const DataLayout &DL) const { 22516 if (!Ty->isArrayTy()) { 22517 const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); 22518 return TySize.isScalable() && TySize.getKnownMinValue() > 128; 22519 } 22520 22521 // All non aggregate members of the type must have the same type 22522 SmallVector<EVT> ValueVTs; 22523 ComputeValueVTs(*this, DL, Ty, ValueVTs); 22524 return all_equal(ValueVTs); 22525 } 22526 22527 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 22528 EVT) const { 22529 return false; 22530 } 22531 22532 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { 22533 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 22534 Function *ThreadPointerFunc = 22535 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); 22536 return IRB.CreatePointerCast( 22537 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 22538 Offset), 22539 IRB.getInt8PtrTy()->getPointerTo(0)); 22540 } 22541 22542 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { 22543 // Android provides a fixed TLS slot for the stack cookie. See the definition 22544 // of TLS_SLOT_STACK_GUARD in 22545 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 22546 if (Subtarget->isTargetAndroid()) 22547 return UseTlsOffset(IRB, 0x28); 22548 22549 // Fuchsia is similar. 22550 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 22551 if (Subtarget->isTargetFuchsia()) 22552 return UseTlsOffset(IRB, -0x10); 22553 22554 return TargetLowering::getIRStackGuard(IRB); 22555 } 22556 22557 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { 22558 // MSVC CRT provides functionalities for stack protection. 22559 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { 22560 // MSVC CRT has a global variable holding security cookie. 22561 M.getOrInsertGlobal("__security_cookie", 22562 Type::getInt8PtrTy(M.getContext())); 22563 22564 // MSVC CRT has a function to validate security cookie. 22565 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 22566 Subtarget->getSecurityCheckCookieName(), 22567 Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); 22568 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { 22569 F->setCallingConv(CallingConv::Win64); 22570 F->addParamAttr(0, Attribute::AttrKind::InReg); 22571 } 22572 return; 22573 } 22574 TargetLowering::insertSSPDeclarations(M); 22575 } 22576 22577 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { 22578 // MSVC CRT has a global variable holding security cookie. 
22579 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
22580 return M.getGlobalVariable("__security_cookie");
22581 return TargetLowering::getSDagStackGuard(M);
22582 }
22583
22584 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
22585 // MSVC CRT has a function to validate security cookie.
22586 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
22587 return M.getFunction(Subtarget->getSecurityCheckCookieName());
22588 return TargetLowering::getSSPStackGuardCheck(M);
22589 }
22590
22591 Value *
22592 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
22593 // Android provides a fixed TLS slot for the SafeStack pointer. See the
22594 // definition of TLS_SLOT_SAFESTACK in
22595 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
22596 if (Subtarget->isTargetAndroid())
22597 return UseTlsOffset(IRB, 0x48);
22598
22599 // Fuchsia is similar.
22600 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
22601 if (Subtarget->isTargetFuchsia())
22602 return UseTlsOffset(IRB, -0x8);
22603
22604 return TargetLowering::getSafeStackPointerLocation(IRB);
22605 }
22606
22607 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
22608 const Instruction &AndI) const {
22609 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
22610 // this is likely to fold the and/cmp/br into a single tbz instruction. It
22611 // may be beneficial to sink in other cases, but we would have to check that
22612 // the cmp would not get folded into the br to form a cbz for these to be
22613 // beneficial.
22614 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
22615 if (!Mask)
22616 return false;
22617 return Mask->getValue().isPowerOf2();
22618 }
22619
22620 bool AArch64TargetLowering::
22621 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
22622 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
22623 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
22624 SelectionDAG &DAG) const {
22625 // Does baseline recommend not to perform the fold by default?
22626 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
22627 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
22628 return false;
22629 // Else, if this is a vector shift, prefer 'shl'.
22630 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
22631 }
22632
22633 TargetLowering::ShiftLegalizationStrategy
22634 AArch64TargetLowering::preferredShiftLegalizationStrategy(
22635 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
22636 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
22637 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
22638 return ShiftLegalizationStrategy::LowerToLibcall;
22639 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
22640 ExpansionFactor);
22641 }
22642
22643 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22644 // Update IsSplitCSR in AArch64FunctionInfo.
22645 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); 22646 AFI->setIsSplitCSR(true); 22647 } 22648 22649 void AArch64TargetLowering::insertCopiesSplitCSR( 22650 MachineBasicBlock *Entry, 22651 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 22652 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 22653 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 22654 if (!IStart) 22655 return; 22656 22657 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 22658 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 22659 MachineBasicBlock::iterator MBBI = Entry->begin(); 22660 for (const MCPhysReg *I = IStart; *I; ++I) { 22661 const TargetRegisterClass *RC = nullptr; 22662 if (AArch64::GPR64RegClass.contains(*I)) 22663 RC = &AArch64::GPR64RegClass; 22664 else if (AArch64::FPR64RegClass.contains(*I)) 22665 RC = &AArch64::FPR64RegClass; 22666 else 22667 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 22668 22669 Register NewVR = MRI->createVirtualRegister(RC); 22670 // Create copy from CSR to a virtual register. 22671 // FIXME: this currently does not emit CFI pseudo-instructions, it works 22672 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 22673 // nounwind. If we want to generalize this later, we may need to emit 22674 // CFI pseudo-instructions. 22675 assert(Entry->getParent()->getFunction().hasFnAttribute( 22676 Attribute::NoUnwind) && 22677 "Function should be nounwind in insertCopiesSplitCSR!"); 22678 Entry->addLiveIn(*I); 22679 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 22680 .addReg(*I); 22681 22682 // Insert the copy-back instructions right before the terminator. 22683 for (auto *Exit : Exits) 22684 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 22685 TII->get(TargetOpcode::COPY), *I) 22686 .addReg(NewVR); 22687 } 22688 } 22689 22690 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { 22691 // Integer division on AArch64 is expensive. However, when aggressively 22692 // optimizing for code size, we prefer to use a div instruction, as it is 22693 // usually smaller than the alternative sequence. 22694 // The exception to this is vector division. Since AArch64 doesn't have vector 22695 // integer division, leaving the division as-is is a loss even in terms of 22696 // size, because it will have to be scalarized, while the alternative code 22697 // sequence can be performed in vector form. 22698 bool OptSize = Attr.hasFnAttr(Attribute::MinSize); 22699 return OptSize && !VT.isVector(); 22700 } 22701 22702 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 22703 // We want inc-of-add for scalars and sub-of-not for vectors. 22704 return VT.isScalarInteger(); 22705 } 22706 22707 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, 22708 EVT VT) const { 22709 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to 22710 // legalize. 
22711 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
22712 return false;
22713 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
22714 }
22715
22716 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
22717 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
22718 }
22719
22720 unsigned
22721 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
22722 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
22723 return getPointerTy(DL).getSizeInBits();
22724
22725 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
22726 }
22727
22728 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
22729 MachineFrameInfo &MFI = MF.getFrameInfo();
22730 // If we have any vulnerable SVE stack objects then the stack protector
22731 // needs to be placed at the top of the SVE stack area, as the SVE locals
22732 // are placed above the other locals, so we allocate it as if it were a
22733 // scalable vector.
22734 // FIXME: It may be worthwhile having a specific interface for this rather
22735 // than doing it here in finalizeLowering.
22736 if (MFI.hasStackProtectorIndex()) {
22737 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
22738 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
22739 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
22740 MFI.setStackID(MFI.getStackProtectorIndex(),
22741 TargetStackID::ScalableVector);
22742 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
22743 break;
22744 }
22745 }
22746 }
22747 MFI.computeMaxCallFrameSize(MF);
22748 TargetLoweringBase::finalizeLowering(MF);
22749 }
22750
22751 // Unlike X86, we let frame lowering assign offsets to all catch objects.
22752 bool AArch64TargetLowering::needsFixedCatchObjects() const {
22753 return false;
22754 }
22755
22756 bool AArch64TargetLowering::shouldLocalize(
22757 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
22758 auto &MF = *MI.getMF();
22759 auto &MRI = MF.getRegInfo();
22760 auto maxUses = [](unsigned RematCost) {
22761 // A cost of 1 means remats are basically free.
22762 if (RematCost == 1)
22763 return std::numeric_limits<unsigned>::max();
22764 if (RematCost == 2)
22765 return 2U;
22766
22767 // Remat is too expensive, only sink if there's one user.
22768 if (RematCost > 2)
22769 return 1U;
22770 llvm_unreachable("Unexpected remat cost");
22771 };
22772
22773 switch (MI.getOpcode()) {
22774 case TargetOpcode::G_GLOBAL_VALUE: {
22775 // On Darwin, TLS global vars get selected into function calls, which
22776 // we don't want localized, as they can get moved into the middle of
22777 // another call sequence.
22778 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
22779 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
22780 return false;
22781 break;
22782 }
22783 case TargetOpcode::G_CONSTANT: {
22784 auto *CI = MI.getOperand(1).getCImm();
22785 APInt Imm = CI->getValue();
22786 InstructionCost Cost = TTI->getIntImmCost(
22787 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
22788 assert(Cost.isValid() && "Expected a valid imm cost");
22789
22790 unsigned RematCost = *Cost.getValue();
22791 Register Reg = MI.getOperand(0).getReg();
22792 unsigned MaxUses = maxUses(RematCost);
22793 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
22794 if (MaxUses == std::numeric_limits<unsigned>::max()) 22795 --MaxUses; 22796 return MRI.hasAtMostUserInstrs(Reg, MaxUses); 22797 } 22798 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being 22799 // localizable. 22800 case AArch64::ADRP: 22801 case AArch64::G_ADD_LOW: 22802 return true; 22803 default: 22804 break; 22805 } 22806 return TargetLoweringBase::shouldLocalize(MI, TTI); 22807 } 22808 22809 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { 22810 if (isa<ScalableVectorType>(Inst.getType())) 22811 return true; 22812 22813 for (unsigned i = 0; i < Inst.getNumOperands(); ++i) 22814 if (isa<ScalableVectorType>(Inst.getOperand(i)->getType())) 22815 return true; 22816 22817 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { 22818 if (isa<ScalableVectorType>(AI->getAllocatedType())) 22819 return true; 22820 } 22821 22822 // Checks to allow the use of SME instructions 22823 if (auto *Base = dyn_cast<CallBase>(&Inst)) { 22824 auto CallerAttrs = SMEAttrs(*Inst.getFunction()); 22825 auto CalleeAttrs = SMEAttrs(*Base); 22826 if (CallerAttrs.requiresSMChange(CalleeAttrs, 22827 /*BodyOverridesInterface=*/false) || 22828 CallerAttrs.requiresLazySave(CalleeAttrs)) 22829 return true; 22830 } 22831 return false; 22832 } 22833 22834 // Return the largest legal scalable vector type that matches VT's element type. 22835 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { 22836 assert(VT.isFixedLengthVector() && 22837 DAG.getTargetLoweringInfo().isTypeLegal(VT) && 22838 "Expected legal fixed length vector!"); 22839 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 22840 default: 22841 llvm_unreachable("unexpected element type for SVE container"); 22842 case MVT::i8: 22843 return EVT(MVT::nxv16i8); 22844 case MVT::i16: 22845 return EVT(MVT::nxv8i16); 22846 case MVT::i32: 22847 return EVT(MVT::nxv4i32); 22848 case MVT::i64: 22849 return EVT(MVT::nxv2i64); 22850 case MVT::f16: 22851 return EVT(MVT::nxv8f16); 22852 case MVT::f32: 22853 return EVT(MVT::nxv4f32); 22854 case MVT::f64: 22855 return EVT(MVT::nxv2f64); 22856 } 22857 } 22858 22859 // Return a PTRUE with active lanes corresponding to the extent of VT. 22860 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, 22861 EVT VT) { 22862 assert(VT.isFixedLengthVector() && 22863 DAG.getTargetLoweringInfo().isTypeLegal(VT) && 22864 "Expected legal fixed length vector!"); 22865 22866 std::optional<unsigned> PgPattern = 22867 getSVEPredPatternFromNumElements(VT.getVectorNumElements()); 22868 assert(PgPattern && "Unexpected element count for SVE predicate"); 22869 22870 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use 22871 // AArch64SVEPredPattern::all, which can enable the use of unpredicated 22872 // variants of instructions when available. 
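  // For example, an 8-element fixed vector with i32/f32 lanes uses an nxv4i1
  // PTRUE with pattern vl8; when the SVE register size is known to be exactly
  // the fixed vector's size, the 'all' pattern is chosen instead.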
22873 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 22874 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); 22875 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); 22876 if (MaxSVESize && MinSVESize == MaxSVESize && 22877 MaxSVESize == VT.getSizeInBits()) 22878 PgPattern = AArch64SVEPredPattern::all; 22879 22880 MVT MaskVT; 22881 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 22882 default: 22883 llvm_unreachable("unexpected element type for SVE predicate"); 22884 case MVT::i8: 22885 MaskVT = MVT::nxv16i1; 22886 break; 22887 case MVT::i16: 22888 case MVT::f16: 22889 MaskVT = MVT::nxv8i1; 22890 break; 22891 case MVT::i32: 22892 case MVT::f32: 22893 MaskVT = MVT::nxv4i1; 22894 break; 22895 case MVT::i64: 22896 case MVT::f64: 22897 MaskVT = MVT::nxv2i1; 22898 break; 22899 } 22900 22901 return getPTrue(DAG, DL, MaskVT, *PgPattern); 22902 } 22903 22904 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, 22905 EVT VT) { 22906 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 22907 "Expected legal scalable vector!"); 22908 auto PredTy = VT.changeVectorElementType(MVT::i1); 22909 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); 22910 } 22911 22912 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { 22913 if (VT.isFixedLengthVector()) 22914 return getPredicateForFixedLengthVector(DAG, DL, VT); 22915 22916 return getPredicateForScalableVector(DAG, DL, VT); 22917 } 22918 22919 // Grow V to consume an entire SVE register. 22920 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { 22921 assert(VT.isScalableVector() && 22922 "Expected to convert into a scalable vector!"); 22923 assert(V.getValueType().isFixedLengthVector() && 22924 "Expected a fixed length vector operand!"); 22925 SDLoc DL(V); 22926 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 22927 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); 22928 } 22929 22930 // Shrink V so it's just big enough to maintain a VT's worth of data. 22931 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { 22932 assert(VT.isFixedLengthVector() && 22933 "Expected to convert into a fixed length vector!"); 22934 assert(V.getValueType().isScalableVector() && 22935 "Expected a scalable vector operand!"); 22936 SDLoc DL(V); 22937 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 22938 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero); 22939 } 22940 22941 // Convert all fixed length vector loads larger than NEON to masked_loads. 
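// For example, on a 256-bit SVE target a v8f32 load becomes (roughly) a
// predicated load of the nxv4f32 container:
//   ptrue p0.s, vl8
//   ld1w  { z0.s }, p0/z, [x0]
// with the result then treated as the original fixed-length vector.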
22942 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( 22943 SDValue Op, SelectionDAG &DAG) const { 22944 auto Load = cast<LoadSDNode>(Op); 22945 22946 SDLoc DL(Op); 22947 EVT VT = Op.getValueType(); 22948 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 22949 EVT LoadVT = ContainerVT; 22950 EVT MemVT = Load->getMemoryVT(); 22951 22952 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); 22953 22954 if (VT.isFloatingPoint()) { 22955 LoadVT = ContainerVT.changeTypeToInteger(); 22956 MemVT = MemVT.changeTypeToInteger(); 22957 } 22958 22959 SDValue NewLoad = DAG.getMaskedLoad( 22960 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, 22961 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), 22962 Load->getAddressingMode(), Load->getExtensionType()); 22963 22964 SDValue Result = NewLoad; 22965 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { 22966 EVT ExtendVT = ContainerVT.changeVectorElementType( 22967 Load->getMemoryVT().getVectorElementType()); 22968 22969 Result = getSVESafeBitCast(ExtendVT, Result, DAG); 22970 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, 22971 Pg, Result, DAG.getUNDEF(ContainerVT)); 22972 } else if (VT.isFloatingPoint()) { 22973 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result); 22974 } 22975 22976 Result = convertFromScalableVector(DAG, VT, Result); 22977 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; 22978 return DAG.getMergeValues(MergedValues, DL); 22979 } 22980 22981 static SDValue convertFixedMaskToScalableVector(SDValue Mask, 22982 SelectionDAG &DAG) { 22983 SDLoc DL(Mask); 22984 EVT InVT = Mask.getValueType(); 22985 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 22986 22987 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); 22988 22989 if (ISD::isBuildVectorAllOnes(Mask.getNode())) 22990 return Pg; 22991 22992 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); 22993 auto Op2 = DAG.getConstant(0, DL, ContainerVT); 22994 22995 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), 22996 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); 22997 } 22998 22999 // Convert all fixed length vector loads larger than NEON to masked_loads. 
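// The masked-load variant below additionally converts the fixed-length mask
// itself to an SVE predicate and, when the passthru is neither undef nor
// zero, re-selects the passthru lanes after the load.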
23000 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( 23001 SDValue Op, SelectionDAG &DAG) const { 23002 auto Load = cast<MaskedLoadSDNode>(Op); 23003 23004 SDLoc DL(Op); 23005 EVT VT = Op.getValueType(); 23006 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23007 23008 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG); 23009 23010 SDValue PassThru; 23011 bool IsPassThruZeroOrUndef = false; 23012 23013 if (Load->getPassThru()->isUndef()) { 23014 PassThru = DAG.getUNDEF(ContainerVT); 23015 IsPassThruZeroOrUndef = true; 23016 } else { 23017 if (ContainerVT.isInteger()) 23018 PassThru = DAG.getConstant(0, DL, ContainerVT); 23019 else 23020 PassThru = DAG.getConstantFP(0, DL, ContainerVT); 23021 if (isZerosVector(Load->getPassThru().getNode())) 23022 IsPassThruZeroOrUndef = true; 23023 } 23024 23025 SDValue NewLoad = DAG.getMaskedLoad( 23026 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), 23027 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(), 23028 Load->getAddressingMode(), Load->getExtensionType()); 23029 23030 SDValue Result = NewLoad; 23031 if (!IsPassThruZeroOrUndef) { 23032 SDValue OldPassThru = 23033 convertToScalableVector(DAG, ContainerVT, Load->getPassThru()); 23034 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru); 23035 } 23036 23037 Result = convertFromScalableVector(DAG, VT, Result); 23038 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; 23039 return DAG.getMergeValues(MergedValues, DL); 23040 } 23041 23042 // Convert all fixed length vector stores larger than NEON to masked_stores. 23043 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( 23044 SDValue Op, SelectionDAG &DAG) const { 23045 auto Store = cast<StoreSDNode>(Op); 23046 23047 SDLoc DL(Op); 23048 EVT VT = Store->getValue().getValueType(); 23049 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23050 EVT MemVT = Store->getMemoryVT(); 23051 23052 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); 23053 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); 23054 23055 if (VT.isFloatingPoint() && Store->isTruncatingStore()) { 23056 EVT TruncVT = ContainerVT.changeVectorElementType( 23057 Store->getMemoryVT().getVectorElementType()); 23058 MemVT = MemVT.changeTypeToInteger(); 23059 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg, 23060 NewValue, DAG.getTargetConstant(0, DL, MVT::i64), 23061 DAG.getUNDEF(TruncVT)); 23062 NewValue = 23063 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); 23064 } else if (VT.isFloatingPoint()) { 23065 MemVT = MemVT.changeTypeToInteger(); 23066 NewValue = 23067 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); 23068 } 23069 23070 return DAG.getMaskedStore(Store->getChain(), DL, NewValue, 23071 Store->getBasePtr(), Store->getOffset(), Pg, MemVT, 23072 Store->getMemOperand(), Store->getAddressingMode(), 23073 Store->isTruncatingStore()); 23074 } 23075 23076 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( 23077 SDValue Op, SelectionDAG &DAG) const { 23078 auto *Store = cast<MaskedStoreSDNode>(Op); 23079 23080 SDLoc DL(Op); 23081 EVT VT = Store->getValue().getValueType(); 23082 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23083 23084 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); 23085 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG); 23086 23087 return DAG.getMaskedStore( 23088 
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), 23089 Mask, Store->getMemoryVT(), Store->getMemOperand(), 23090 Store->getAddressingMode(), Store->isTruncatingStore()); 23091 } 23092 23093 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( 23094 SDValue Op, SelectionDAG &DAG) const { 23095 SDLoc dl(Op); 23096 EVT VT = Op.getValueType(); 23097 EVT EltVT = VT.getVectorElementType(); 23098 23099 bool Signed = Op.getOpcode() == ISD::SDIV; 23100 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; 23101 23102 bool Negated; 23103 uint64_t SplatVal; 23104 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { 23105 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23106 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); 23107 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32); 23108 23109 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT); 23110 SDValue Res = 23111 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2); 23112 if (Negated) 23113 Res = DAG.getNode(ISD::SUB, dl, ContainerVT, 23114 DAG.getConstant(0, dl, ContainerVT), Res); 23115 23116 return convertFromScalableVector(DAG, VT, Res); 23117 } 23118 23119 // Scalable vector i32/i64 DIV is supported. 23120 if (EltVT == MVT::i32 || EltVT == MVT::i64) 23121 return LowerToPredicatedOp(Op, DAG, PredOpcode); 23122 23123 // Scalable vector i8/i16 DIV is not supported. Promote it to i32. 23124 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 23125 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); 23126 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 23127 23128 // If the wider type is legal: extend, op, and truncate. 23129 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext()); 23130 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) { 23131 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0)); 23132 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1)); 23133 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1); 23134 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div); 23135 } 23136 23137 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT, 23138 &ExtendOpcode](SDValue Op) { 23139 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64); 23140 SDValue IdxHalf = 23141 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64); 23142 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero); 23143 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf); 23144 return std::pair<SDValue, SDValue>( 23145 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo), 23146 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)}); 23147 }; 23148 23149 // If wider type is not legal: split, extend, op, trunc and concat. 
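  // For example, when the widened type is not legal (say v32i16 for a v32i8
  // divide), each half is extended to the promoted type, divided there
  // (recursing until a supported i32/i64 element divide is reached), then
  // truncated and concatenated back together.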
23150 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0)); 23151 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1)); 23152 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt); 23153 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt); 23154 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo); 23155 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi); 23156 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc}); 23157 } 23158 23159 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( 23160 SDValue Op, SelectionDAG &DAG) const { 23161 EVT VT = Op.getValueType(); 23162 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 23163 23164 SDLoc DL(Op); 23165 SDValue Val = Op.getOperand(0); 23166 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); 23167 Val = convertToScalableVector(DAG, ContainerVT, Val); 23168 23169 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; 23170 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; 23171 23172 // Repeatedly unpack Val until the result is of the desired element type. 23173 switch (ContainerVT.getSimpleVT().SimpleTy) { 23174 default: 23175 llvm_unreachable("unimplemented container type"); 23176 case MVT::nxv16i8: 23177 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val); 23178 if (VT.getVectorElementType() == MVT::i16) 23179 break; 23180 [[fallthrough]]; 23181 case MVT::nxv8i16: 23182 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val); 23183 if (VT.getVectorElementType() == MVT::i32) 23184 break; 23185 [[fallthrough]]; 23186 case MVT::nxv4i32: 23187 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val); 23188 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!"); 23189 break; 23190 } 23191 23192 return convertFromScalableVector(DAG, VT, Val); 23193 } 23194 23195 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( 23196 SDValue Op, SelectionDAG &DAG) const { 23197 EVT VT = Op.getValueType(); 23198 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 23199 23200 SDLoc DL(Op); 23201 SDValue Val = Op.getOperand(0); 23202 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); 23203 Val = convertToScalableVector(DAG, ContainerVT, Val); 23204 23205 // Repeatedly truncate Val until the result is of the desired element type. 
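  // For example, truncating from i64 to i8 lanes steps through
  // nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8, each step a BITCAST plus UZP1
  // that keeps the low half of every wider element.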
23206 switch (ContainerVT.getSimpleVT().SimpleTy) { 23207 default: 23208 llvm_unreachable("unimplemented container type"); 23209 case MVT::nxv2i64: 23210 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); 23211 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); 23212 if (VT.getVectorElementType() == MVT::i32) 23213 break; 23214 [[fallthrough]]; 23215 case MVT::nxv4i32: 23216 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); 23217 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); 23218 if (VT.getVectorElementType() == MVT::i16) 23219 break; 23220 [[fallthrough]]; 23221 case MVT::nxv8i16: 23222 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); 23223 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); 23224 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); 23225 break; 23226 } 23227 23228 return convertFromScalableVector(DAG, VT, Val); 23229 } 23230 23231 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt( 23232 SDValue Op, SelectionDAG &DAG) const { 23233 EVT VT = Op.getValueType(); 23234 EVT InVT = Op.getOperand(0).getValueType(); 23235 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!"); 23236 23237 SDLoc DL(Op); 23238 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 23239 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); 23240 23241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1)); 23242 } 23243 23244 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( 23245 SDValue Op, SelectionDAG &DAG) const { 23246 EVT VT = Op.getValueType(); 23247 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 23248 23249 SDLoc DL(Op); 23250 EVT InVT = Op.getOperand(0).getValueType(); 23251 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); 23252 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); 23253 23254 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0, 23255 Op.getOperand(1), Op.getOperand(2)); 23256 23257 return convertFromScalableVector(DAG, VT, ScalableRes); 23258 } 23259 23260 // Convert vector operation 'Op' to an equivalent predicated operation whereby 23261 // the original operation's type is used to construct a suitable predicate. 23262 // NOTE: The results for inactive lanes are undefined. 23263 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, 23264 SelectionDAG &DAG, 23265 unsigned NewOp) const { 23266 EVT VT = Op.getValueType(); 23267 SDLoc DL(Op); 23268 auto Pg = getPredicateForVector(DAG, DL, VT); 23269 23270 if (VT.isFixedLengthVector()) { 23271 assert(isTypeLegal(VT) && "Expected only legal fixed-width types"); 23272 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23273 23274 // Create list of operands by converting existing ones to scalable types. 
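    // Condition codes and VT operands are passed through (retyped to the
    // container's element type where needed); vector operands are widened
    // into the container type, e.g. v4f32 -> nxv4f32.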
23275 SmallVector<SDValue, 4> Operands = {Pg}; 23276 for (const SDValue &V : Op->op_values()) { 23277 if (isa<CondCodeSDNode>(V)) { 23278 Operands.push_back(V); 23279 continue; 23280 } 23281 23282 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) { 23283 EVT VTArg = VTNode->getVT().getVectorElementType(); 23284 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg); 23285 Operands.push_back(DAG.getValueType(NewVTArg)); 23286 continue; 23287 } 23288 23289 assert(isTypeLegal(V.getValueType()) && 23290 "Expected only legal fixed-width types"); 23291 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); 23292 } 23293 23294 if (isMergePassthruOpcode(NewOp)) 23295 Operands.push_back(DAG.getUNDEF(ContainerVT)); 23296 23297 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); 23298 return convertFromScalableVector(DAG, VT, ScalableRes); 23299 } 23300 23301 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); 23302 23303 SmallVector<SDValue, 4> Operands = {Pg}; 23304 for (const SDValue &V : Op->op_values()) { 23305 assert((!V.getValueType().isVector() || 23306 V.getValueType().isScalableVector()) && 23307 "Only scalable vectors are supported!"); 23308 Operands.push_back(V); 23309 } 23310 23311 if (isMergePassthruOpcode(NewOp)) 23312 Operands.push_back(DAG.getUNDEF(VT)); 23313 23314 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); 23315 } 23316 23317 // If a fixed length vector operation has no side effects when applied to 23318 // undefined elements, we can safely use scalable vectors to perform the same 23319 // operation without needing to worry about predication. 23320 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, 23321 SelectionDAG &DAG) const { 23322 EVT VT = Op.getValueType(); 23323 assert(VT.isFixedLengthVector() && isTypeLegal(VT) && 23324 "Only expected to lower fixed length vector operation!"); 23325 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 23326 23327 // Create list of operands by converting existing ones to scalable types. 23328 SmallVector<SDValue, 4> Ops; 23329 for (const SDValue &V : Op->op_values()) { 23330 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!"); 23331 23332 // Pass through non-vector operands. 23333 if (!V.getValueType().isVector()) { 23334 Ops.push_back(V); 23335 continue; 23336 } 23337 23338 // "cast" fixed length vector to a scalable vector. 23339 assert(V.getValueType().isFixedLengthVector() && 23340 isTypeLegal(V.getValueType()) && 23341 "Only fixed length vectors are supported!"); 23342 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); 23343 } 23344 23345 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops); 23346 return convertFromScalableVector(DAG, VT, ScalableRes); 23347 } 23348 23349 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, 23350 SelectionDAG &DAG) const { 23351 SDLoc DL(ScalarOp); 23352 SDValue AccOp = ScalarOp.getOperand(0); 23353 SDValue VecOp = ScalarOp.getOperand(1); 23354 EVT SrcVT = VecOp.getValueType(); 23355 EVT ResVT = SrcVT.getVectorElementType(); 23356 23357 EVT ContainerVT = SrcVT; 23358 if (SrcVT.isFixedLengthVector()) { 23359 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); 23360 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); 23361 } 23362 23363 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); 23364 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 23365 23366 // Convert operands to Scalable. 
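  // The accumulator is placed in lane 0 of an otherwise undef vector so that
  // FADDA can thread it through the strictly-ordered reduction; the scalar
  // result is read back from lane 0 afterwards.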
  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
                      DAG.getUNDEF(ContainerVT), AccOp, Zero);

  // Perform reduction.
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}

SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ReduceOp);
  SDValue Op = ReduceOp.getOperand(0);
  EVT OpVT = Op.getValueType();
  EVT VT = ReduceOp.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);

  switch (ReduceOp.getOpcode()) {
  default:
    return SDValue();
  case ISD::VECREDUCE_OR:
    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
      // The predicate can be 'Op' because
      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
    else
      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
  case ISD::VECREDUCE_AND: {
    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
  }
  case ISD::VECREDUCE_XOR: {
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    if (OpVT == MVT::nxv1i1) {
      // Emulate a CNTP on .Q using .D and a different governing predicate.
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
    }
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
  }

  return SDValue();
}

SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
                                                   SDValue ScalarOp,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue VecOp = ScalarOp.getOperand(0);
  EVT SrcVT = VecOp.getValueType();

  if (useSVEForFixedLengthVectorVT(
          SrcVT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                   SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                            Rdx, DAG.getConstant(0, DL, MVT::i64));

  // The VEC_REDUCE nodes expect an element size result.
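  // UADDV in particular widens to i64, so trim (or extend) the scalar back to
  // the type the original VECREDUCE node was created with.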
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}

SDValue
AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
                                                         SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  EVT InVT = Op.getOperand(1).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));

  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
  EVT MaskVT = Op.getOperand(0).getValueType();
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
  Mask = DAG.getNode(ISD::TRUNCATE, DL,
                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);

  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                 Mask, Op1, Op2);

  return convertFromScalableVector(DAG, VT, ScalableRes);
}

SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
         "Only expected to lower fixed length vector operation!");
  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
         "Expected integer result of the same bit length as the inputs!");

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  EVT CmpVT = Pg.getValueType();
  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
                         {Pg, Op1, Op2, Op.getOperand(2)});

  EVT PromoteVT = ContainerVT.changeTypeToInteger();
  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}

SDValue
AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto SrcOp = Op.getOperand(0);
  EVT VT = Op.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT =
      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());

  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
  return convertFromScalableVector(DAG, VT, Op);
}

SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumOperands = Op->getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  auto SrcOp1 = Op.getOperand(0);
  auto SrcOp2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = SrcOp1.getValueType();

  if (NumOperands > 2) {
    SmallVector<SDValue, 4> Ops;
    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
    for (unsigned I = 0; I < NumOperands; I += 2)
      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
                                Op->getOperand(I), Op->getOperand(I + 1)));

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
  }

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);

  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);

  return convertFromScalableVector(DAG, VT, Op);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  EVT SrcVT = Val.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ExtendVT = ContainerVT.changeVectorElementType(
      SrcVT.getVectorElementType());

  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);

  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                    Pg, Val, DAG.getUNDEF(ContainerVT));

  return convertFromScalableVector(DAG, VT, Val);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
      VT.getVectorElementType());
  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);

  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
  Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
                    Op.getOperand(1), DAG.getUNDEF(RoundVT));
  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
}

SDValue
AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGE(SrcVT)) {
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                      VT.changeTypeToInteger(), Val);

    // Safe to use a larger than specified operand because by promoting the
    // value nothing has changed from an arithmetic point of view.
    Val =
        convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
        ContainerDstVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
    Val = convertFromScalableVector(DAG, SrcVT, Val);

    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
  }
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGT(SrcVT)) {
    EVT CvtVT = ContainerDstVT.changeVectorElementType(
        ContainerSrcVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);

    Val = convertToScalableVector(DAG, ContainerDstVT, Val);
    Val = getSVESafeBitCast(CvtVT, Val, DAG);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    // Safe to use a larger than specified result since an fp_to_int where the
    // result doesn't fit into the destination is undefined.
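    // Convert at the source's element width and truncate the integer result
    // down to the requested type afterwards.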
    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
  }
}

SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  auto ShuffleMask = SVN->getMask();

  SDLoc DL(Op);
  SDValue Op1 = Op.getOperand(0);
  SDValue Op2 = Op.getOperand(1);

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);

  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
      return MVT::i32;
    return ScalarTy;
  };

  if (SVN->isSplat()) {
    unsigned Lane = std::max(0, SVN->getSplatIndex());
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
                                  DAG.getConstant(Lane, DL, MVT::i64));
    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
    return convertFromScalableVector(DAG, VT, Op);
  }

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue Scalar = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, VT, LaneSize)) {
      EVT NewVT =
          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
      unsigned RevOp;
      unsigned EltSz = VT.getScalarSizeInBits();
      if (EltSz == 8)
        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
      else if (EltSz == 16)
        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
      else
        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;

      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
      Op = LowerToPredicatedOp(Op, DAG, RevOp);
      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
      return convertFromScalableVector(DAG, VT, Op);
    }
  }

  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
      isREVMask(ShuffleMask, VT, 128)) {
    if (!VT.isFloatingPoint())
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);

    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
    return convertFromScalableVector(DAG, VT, Op);
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));

  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));

  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
  }

  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction.
  // In isolation these functions do not mean the ISD::VECTOR_SHUFFLE is
  // exactly equivalent to an AArch64 instruction. There's the extra component
  // of ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these
  // functions only operated on 64/128-bit vector types that have a direct
  // mapping to a target register and so an exact mapping is implied.
  // However, when using SVE for fixed length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. Index X is the last
  // vector element in the register) then such mappings are often only safe
  // when the exact SVE register size is known. The main exception to this is
  // when indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the start
  // of a fixed length vector is always the start of a scalable vector).
  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
      return convertFromScalableVector(DAG, VT, Op);
    }

    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));

    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
    }

    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));

    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }
  }

  return SDValue();
}

SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(VT.isScalableVector() && isTypeLegal(VT) &&
         InVT.isScalableVector() && isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable vector types!");
  assert(VT.getVectorElementType() != MVT::i1 &&
         InVT.getVectorElementType() != MVT::i1 &&
         "For predicate bitcasts, use getSVEPredicateBitCast");

  if (InVT == VT)
    return Op;

  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());

  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the following is missing the necessary
  // work to ensure the result's elements live where they're supposed to within
  // an SVE register.
  //                01234567
  // e.g. nxv2i32 = XX??XX??
  //      nxv4f16 = X?X?X?X?
  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
          VT == PackedVT || InVT == PackedInVT) &&
         "Unexpected bitcast!");

  // Pack input if required.
  if (InVT != PackedInVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);

  // Unpack result if required.
  if (VT != PackedVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  return Op;
}

bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
                                                 SDValue N) const {
  return ::isAllActivePredicate(DAG, N);
}

EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
  return ::getPromotedVTForPredicate(VT);
}

bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
    // Match (VSHL (VLSHR Val X) X)
    SDValue ShiftL = Op;
    SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      return false;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      return false;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      return false;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      return false;

    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
    // used - simplify to just Val.
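    // E.g. for a 32-bit element and X == 8 the pair only clears the low
    // 8 bits of each lane; if none of those bits are demanded the shifts are
    // redundant and Val can be used directly.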
    return TLO.CombineTo(Op, ShiftR->getOperand(0));
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
      if (!MaxSVEVectorSizeInBits)
        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
      // The SVE count intrinsics don't support the multiplier immediate so we
      // don't have to account for that here. The value returned may be slightly
      // over the true required bits, as this is based on the "ALL" pattern. The
      // other patterns are also exposed by these intrinsics, but they all
      // return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
    unsigned Opc, LLT Ty1, LLT Ty2) const {
  return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
}

bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!VTy)
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getNumElements();

  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
    return false;

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}

Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    Instruction *I, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());

  IRBuilder<> B(I);

  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 or a power of 2 that is at least 128");

  if (TyWidth > 128) {
    int Stride = Ty->getNumElements() / 2;
    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
    auto SplitSeqVec = llvm::to_vector(SplitSeq);
    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);

    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;

    if (Accumulator) {
      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
    }

    auto *LowerSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    if (Accumulator == nullptr)
      Accumulator = ConstantFP::get(Ty, 0);

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputB, InputA});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  return nullptr;
}