1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64TargetLowering class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64ISelLowering.h" 14 #include "AArch64CallingConvention.h" 15 #include "AArch64ExpandImm.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64RegisterInfo.h" 19 #include "AArch64Subtarget.h" 20 #include "MCTargetDesc/AArch64AddressingModes.h" 21 #include "Utils/AArch64BaseInfo.h" 22 #include "llvm/ADT/APFloat.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SmallSet.h" 27 #include "llvm/ADT/SmallVector.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/ADT/StringRef.h" 30 #include "llvm/ADT/StringSwitch.h" 31 #include "llvm/ADT/Triple.h" 32 #include "llvm/ADT/Twine.h" 33 #include "llvm/Analysis/VectorUtils.h" 34 #include "llvm/CodeGen/CallingConvLower.h" 35 #include "llvm/CodeGen/MachineBasicBlock.h" 36 #include "llvm/CodeGen/MachineFrameInfo.h" 37 #include "llvm/CodeGen/MachineFunction.h" 38 #include "llvm/CodeGen/MachineInstr.h" 39 #include "llvm/CodeGen/MachineInstrBuilder.h" 40 #include "llvm/CodeGen/MachineMemOperand.h" 41 #include "llvm/CodeGen/MachineRegisterInfo.h" 42 #include "llvm/CodeGen/RuntimeLibcalls.h" 43 #include "llvm/CodeGen/SelectionDAG.h" 44 #include "llvm/CodeGen/SelectionDAGNodes.h" 45 #include "llvm/CodeGen/TargetCallingConv.h" 46 #include "llvm/CodeGen/TargetInstrInfo.h" 47 #include "llvm/CodeGen/ValueTypes.h" 48 #include "llvm/IR/Attributes.h" 49 #include "llvm/IR/Constants.h" 50 #include "llvm/IR/DataLayout.h" 51 #include "llvm/IR/DebugLoc.h" 52 #include "llvm/IR/DerivedTypes.h" 53 #include "llvm/IR/Function.h" 54 #include "llvm/IR/GetElementPtrTypeIterator.h" 55 #include "llvm/IR/GlobalValue.h" 56 #include "llvm/IR/IRBuilder.h" 57 #include "llvm/IR/Instruction.h" 58 #include "llvm/IR/Instructions.h" 59 #include "llvm/IR/IntrinsicInst.h" 60 #include "llvm/IR/Intrinsics.h" 61 #include "llvm/IR/IntrinsicsAArch64.h" 62 #include "llvm/IR/Module.h" 63 #include "llvm/IR/OperandTraits.h" 64 #include "llvm/IR/PatternMatch.h" 65 #include "llvm/IR/Type.h" 66 #include "llvm/IR/Use.h" 67 #include "llvm/IR/Value.h" 68 #include "llvm/MC/MCRegisterInfo.h" 69 #include "llvm/Support/Casting.h" 70 #include "llvm/Support/CodeGen.h" 71 #include "llvm/Support/CommandLine.h" 72 #include "llvm/Support/Compiler.h" 73 #include "llvm/Support/Debug.h" 74 #include "llvm/Support/ErrorHandling.h" 75 #include "llvm/Support/KnownBits.h" 76 #include "llvm/Support/MachineValueType.h" 77 #include "llvm/Support/MathExtras.h" 78 #include "llvm/Support/raw_ostream.h" 79 #include "llvm/Target/TargetMachine.h" 80 #include "llvm/Target/TargetOptions.h" 81 #include <algorithm> 82 #include <bitset> 83 #include <cassert> 84 #include <cctype> 85 #include <cstdint> 86 #include <cstdlib> 87 #include <iterator> 88 #include <limits> 89 #include <tuple> 90 #include <utility> 91 #include <vector> 92 93 using namespace llvm; 94 using namespace llvm::PatternMatch; 95 96 #define DEBUG_TYPE "aarch64-lower" 97 98 STATISTIC(NumTailCalls, 
"Number of tail calls"); 99 STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 100 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); 101 102 // FIXME: The necessary dtprel relocations don't seem to be supported 103 // well in the GNU bfd and gold linkers at the moment. Therefore, by 104 // default, for now, fall back to GeneralDynamic code generation. 105 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 106 "aarch64-elf-ldtls-generation", cl::Hidden, 107 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 108 cl::init(false)); 109 110 static cl::opt<bool> 111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, 112 cl::desc("Enable AArch64 logical imm instruction " 113 "optimization"), 114 cl::init(true)); 115 116 /// Value type used for condition codes. 117 static const MVT MVT_CC = MVT::i32; 118 119 /// Returns true if VT's elements occupy the lowest bit positions of its 120 /// associated register class without any intervening space. 121 /// 122 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the 123 /// same register class, but only nxv8f16 can be treated as a packed vector. 124 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { 125 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 126 "Expected legal vector type!"); 127 return VT.isFixedLengthVector() || 128 VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; 129 } 130 131 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 132 const AArch64Subtarget &STI) 133 : TargetLowering(TM), Subtarget(&STI) { 134 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so 135 // we have to make something up. Arbitrarily, choose ZeroOrOne. 136 setBooleanContents(ZeroOrOneBooleanContent); 137 // When comparing vectors the result sets the different elements in the 138 // vector to all-one or all-zero. 139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 140 141 // Set up the register classes. 142 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); 143 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); 144 145 if (Subtarget->hasFPARMv8()) { 146 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 147 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); 148 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 149 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 150 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 151 } 152 153 if (Subtarget->hasNEON()) { 154 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); 155 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); 156 // Someone set us up the NEON. 
157 addDRTypeForNEON(MVT::v2f32); 158 addDRTypeForNEON(MVT::v8i8); 159 addDRTypeForNEON(MVT::v4i16); 160 addDRTypeForNEON(MVT::v2i32); 161 addDRTypeForNEON(MVT::v1i64); 162 addDRTypeForNEON(MVT::v1f64); 163 addDRTypeForNEON(MVT::v4f16); 164 addDRTypeForNEON(MVT::v4bf16); 165 166 addQRTypeForNEON(MVT::v4f32); 167 addQRTypeForNEON(MVT::v2f64); 168 addQRTypeForNEON(MVT::v16i8); 169 addQRTypeForNEON(MVT::v8i16); 170 addQRTypeForNEON(MVT::v4i32); 171 addQRTypeForNEON(MVT::v2i64); 172 addQRTypeForNEON(MVT::v8f16); 173 addQRTypeForNEON(MVT::v8bf16); 174 } 175 176 if (Subtarget->hasSVE()) { 177 // Add legal sve predicate types 178 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); 179 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); 180 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); 181 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); 182 183 // Add legal sve data types 184 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); 185 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); 186 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); 187 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); 188 189 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); 190 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); 191 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); 192 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); 193 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); 194 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); 195 196 if (Subtarget->hasBF16()) { 197 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); 198 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); 199 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); 200 } 201 202 if (useSVEForFixedLengthVectors()) { 203 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 204 if (useSVEForFixedLengthVectorVT(VT)) 205 addRegisterClass(VT, &AArch64::ZPRRegClass); 206 207 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) 208 if (useSVEForFixedLengthVectorVT(VT)) 209 addRegisterClass(VT, &AArch64::ZPRRegClass); 210 } 211 212 for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { 213 setOperationAction(ISD::SADDSAT, VT, Legal); 214 setOperationAction(ISD::UADDSAT, VT, Legal); 215 setOperationAction(ISD::SSUBSAT, VT, Legal); 216 setOperationAction(ISD::USUBSAT, VT, Legal); 217 setOperationAction(ISD::UREM, VT, Expand); 218 setOperationAction(ISD::SREM, VT, Expand); 219 setOperationAction(ISD::SDIVREM, VT, Expand); 220 setOperationAction(ISD::UDIVREM, VT, Expand); 221 } 222 223 for (auto VT : 224 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, 225 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) 226 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); 227 228 for (auto VT : 229 { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, 230 MVT::nxv2f64 }) { 231 setCondCodeAction(ISD::SETO, VT, Expand); 232 setCondCodeAction(ISD::SETOLT, VT, Expand); 233 setCondCodeAction(ISD::SETOLE, VT, Expand); 234 setCondCodeAction(ISD::SETULT, VT, Expand); 235 setCondCodeAction(ISD::SETULE, VT, Expand); 236 setCondCodeAction(ISD::SETUGE, VT, Expand); 237 setCondCodeAction(ISD::SETUGT, VT, Expand); 238 setCondCodeAction(ISD::SETUEQ, VT, Expand); 239 setCondCodeAction(ISD::SETUNE, VT, Expand); 240 } 241 } 242 243 // Compute derived properties from the register classes 244 computeRegisterProperties(Subtarget->getRegisterInfo()); 245 246 // Provide all sorts of operation actions 247 setOperationAction(ISD::GlobalAddress, MVT::i64, 
Custom); 248 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 249 setOperationAction(ISD::SETCC, MVT::i32, Custom); 250 setOperationAction(ISD::SETCC, MVT::i64, Custom); 251 setOperationAction(ISD::SETCC, MVT::f16, Custom); 252 setOperationAction(ISD::SETCC, MVT::f32, Custom); 253 setOperationAction(ISD::SETCC, MVT::f64, Custom); 254 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 255 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 256 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 257 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 258 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 259 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 260 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 261 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 262 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 263 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 264 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 265 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 266 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 267 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 268 setOperationAction(ISD::SELECT, MVT::i32, Custom); 269 setOperationAction(ISD::SELECT, MVT::i64, Custom); 270 setOperationAction(ISD::SELECT, MVT::f16, Custom); 271 setOperationAction(ISD::SELECT, MVT::f32, Custom); 272 setOperationAction(ISD::SELECT, MVT::f64, Custom); 273 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 274 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 275 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 276 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 277 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 278 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 279 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 280 281 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 282 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 283 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 284 285 setOperationAction(ISD::FREM, MVT::f32, Expand); 286 setOperationAction(ISD::FREM, MVT::f64, Expand); 287 setOperationAction(ISD::FREM, MVT::f80, Expand); 288 289 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); 290 291 // Custom lowering hooks are needed for XOR 292 // to fold it into CSINC/CSINV. 293 setOperationAction(ISD::XOR, MVT::i32, Custom); 294 setOperationAction(ISD::XOR, MVT::i64, Custom); 295 296 // Virtually no operation on f128 is legal, but LLVM can't expand them when 297 // there's a valid register class, so we need custom operations in most cases. 
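  // Most of the Custom handling below ends up emitting f128 libcalls
  // (e.g. __addtf3 for FADD).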
298 setOperationAction(ISD::FABS, MVT::f128, Expand); 299 setOperationAction(ISD::FADD, MVT::f128, Custom); 300 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 301 setOperationAction(ISD::FCOS, MVT::f128, Expand); 302 setOperationAction(ISD::FDIV, MVT::f128, Custom); 303 setOperationAction(ISD::FMA, MVT::f128, Expand); 304 setOperationAction(ISD::FMUL, MVT::f128, Custom); 305 setOperationAction(ISD::FNEG, MVT::f128, Expand); 306 setOperationAction(ISD::FPOW, MVT::f128, Expand); 307 setOperationAction(ISD::FREM, MVT::f128, Expand); 308 setOperationAction(ISD::FRINT, MVT::f128, Expand); 309 setOperationAction(ISD::FSIN, MVT::f128, Expand); 310 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 311 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 312 setOperationAction(ISD::FSUB, MVT::f128, Custom); 313 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 314 setOperationAction(ISD::SETCC, MVT::f128, Custom); 315 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); 316 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); 317 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 318 setOperationAction(ISD::SELECT, MVT::f128, Custom); 319 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 320 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 321 322 // Lowering for many of the conversions is actually specified by the non-f128 323 // type. The LowerXXX function will be trivial when f128 isn't involved. 324 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 325 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 326 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 327 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 328 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); 329 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); 330 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 331 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 332 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 333 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 334 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); 335 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); 336 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 337 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 338 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 339 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); 340 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); 341 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); 342 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 343 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 344 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 345 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); 346 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); 347 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); 348 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 349 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 350 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 351 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); 352 353 // Variable arguments. 354 setOperationAction(ISD::VASTART, MVT::Other, Custom); 355 setOperationAction(ISD::VAARG, MVT::Other, Custom); 356 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 357 setOperationAction(ISD::VAEND, MVT::Other, Expand); 358 359 // Variable-sized objects. 
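  // On Windows, dynamic stack allocations must probe the stack (__chkstk),
  // hence the Custom lowering below; elsewhere the generic expansion suffices.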
360 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 361 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 362 363 if (Subtarget->isTargetWindows()) 364 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 365 else 366 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 367 368 // Constant pool entries 369 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 370 371 // BlockAddress 372 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 373 374 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 375 setOperationAction(ISD::ADDC, MVT::i32, Custom); 376 setOperationAction(ISD::ADDE, MVT::i32, Custom); 377 setOperationAction(ISD::SUBC, MVT::i32, Custom); 378 setOperationAction(ISD::SUBE, MVT::i32, Custom); 379 setOperationAction(ISD::ADDC, MVT::i64, Custom); 380 setOperationAction(ISD::ADDE, MVT::i64, Custom); 381 setOperationAction(ISD::SUBC, MVT::i64, Custom); 382 setOperationAction(ISD::SUBE, MVT::i64, Custom); 383 384 // AArch64 lacks both left-rotate and popcount instructions. 385 setOperationAction(ISD::ROTL, MVT::i32, Expand); 386 setOperationAction(ISD::ROTL, MVT::i64, Expand); 387 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 388 setOperationAction(ISD::ROTL, VT, Expand); 389 setOperationAction(ISD::ROTR, VT, Expand); 390 } 391 392 // AArch64 doesn't have i32 MULH{S|U}. 393 setOperationAction(ISD::MULHU, MVT::i32, Expand); 394 setOperationAction(ISD::MULHS, MVT::i32, Expand); 395 396 // AArch64 doesn't have {U|S}MUL_LOHI. 397 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 398 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 399 400 setOperationAction(ISD::CTPOP, MVT::i32, Custom); 401 setOperationAction(ISD::CTPOP, MVT::i64, Custom); 402 setOperationAction(ISD::CTPOP, MVT::i128, Custom); 403 404 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 405 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 406 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 407 setOperationAction(ISD::SDIVREM, VT, Expand); 408 setOperationAction(ISD::UDIVREM, VT, Expand); 409 } 410 setOperationAction(ISD::SREM, MVT::i32, Expand); 411 setOperationAction(ISD::SREM, MVT::i64, Expand); 412 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 413 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 414 setOperationAction(ISD::UREM, MVT::i32, Expand); 415 setOperationAction(ISD::UREM, MVT::i64, Expand); 416 417 // Custom lower Add/Sub/Mul with overflow. 
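  // The [SU]ADDO/[SU]SUBO forms become flag-setting ADDS/SUBS plus a CSET of
  // the carry/overflow condition; the multiply-with-overflow forms need
  // longer sequences.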
418 setOperationAction(ISD::SADDO, MVT::i32, Custom); 419 setOperationAction(ISD::SADDO, MVT::i64, Custom); 420 setOperationAction(ISD::UADDO, MVT::i32, Custom); 421 setOperationAction(ISD::UADDO, MVT::i64, Custom); 422 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 423 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 424 setOperationAction(ISD::USUBO, MVT::i32, Custom); 425 setOperationAction(ISD::USUBO, MVT::i64, Custom); 426 setOperationAction(ISD::SMULO, MVT::i32, Custom); 427 setOperationAction(ISD::SMULO, MVT::i64, Custom); 428 setOperationAction(ISD::UMULO, MVT::i32, Custom); 429 setOperationAction(ISD::UMULO, MVT::i64, Custom); 430 431 setOperationAction(ISD::FSIN, MVT::f32, Expand); 432 setOperationAction(ISD::FSIN, MVT::f64, Expand); 433 setOperationAction(ISD::FCOS, MVT::f32, Expand); 434 setOperationAction(ISD::FCOS, MVT::f64, Expand); 435 setOperationAction(ISD::FPOW, MVT::f32, Expand); 436 setOperationAction(ISD::FPOW, MVT::f64, Expand); 437 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 438 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 439 if (Subtarget->hasFullFP16()) 440 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); 441 else 442 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 443 444 setOperationAction(ISD::FREM, MVT::f16, Promote); 445 setOperationAction(ISD::FREM, MVT::v4f16, Expand); 446 setOperationAction(ISD::FREM, MVT::v8f16, Expand); 447 setOperationAction(ISD::FPOW, MVT::f16, Promote); 448 setOperationAction(ISD::FPOW, MVT::v4f16, Expand); 449 setOperationAction(ISD::FPOW, MVT::v8f16, Expand); 450 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 451 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); 452 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); 453 setOperationAction(ISD::FCOS, MVT::f16, Promote); 454 setOperationAction(ISD::FCOS, MVT::v4f16, Expand); 455 setOperationAction(ISD::FCOS, MVT::v8f16, Expand); 456 setOperationAction(ISD::FSIN, MVT::f16, Promote); 457 setOperationAction(ISD::FSIN, MVT::v4f16, Expand); 458 setOperationAction(ISD::FSIN, MVT::v8f16, Expand); 459 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 460 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); 461 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); 462 setOperationAction(ISD::FEXP, MVT::f16, Promote); 463 setOperationAction(ISD::FEXP, MVT::v4f16, Expand); 464 setOperationAction(ISD::FEXP, MVT::v8f16, Expand); 465 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 466 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); 467 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); 468 setOperationAction(ISD::FLOG, MVT::f16, Promote); 469 setOperationAction(ISD::FLOG, MVT::v4f16, Expand); 470 setOperationAction(ISD::FLOG, MVT::v8f16, Expand); 471 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 472 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); 473 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); 474 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 475 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); 476 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); 477 478 if (!Subtarget->hasFullFP16()) { 479 setOperationAction(ISD::SELECT, MVT::f16, Promote); 480 setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); 481 setOperationAction(ISD::SETCC, MVT::f16, Promote); 482 setOperationAction(ISD::BR_CC, MVT::f16, Promote); 483 setOperationAction(ISD::FADD, MVT::f16, Promote); 484 setOperationAction(ISD::FSUB, MVT::f16, Promote); 485 setOperationAction(ISD::FMUL, MVT::f16, Promote); 486 setOperationAction(ISD::FDIV, MVT::f16, 
Promote); 487 setOperationAction(ISD::FMA, MVT::f16, Promote); 488 setOperationAction(ISD::FNEG, MVT::f16, Promote); 489 setOperationAction(ISD::FABS, MVT::f16, Promote); 490 setOperationAction(ISD::FCEIL, MVT::f16, Promote); 491 setOperationAction(ISD::FSQRT, MVT::f16, Promote); 492 setOperationAction(ISD::FFLOOR, MVT::f16, Promote); 493 setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); 494 setOperationAction(ISD::FRINT, MVT::f16, Promote); 495 setOperationAction(ISD::FROUND, MVT::f16, Promote); 496 setOperationAction(ISD::FTRUNC, MVT::f16, Promote); 497 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 498 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 499 setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); 500 setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); 501 502 // promote v4f16 to v4f32 when that is known to be safe. 503 setOperationAction(ISD::FADD, MVT::v4f16, Promote); 504 setOperationAction(ISD::FSUB, MVT::v4f16, Promote); 505 setOperationAction(ISD::FMUL, MVT::v4f16, Promote); 506 setOperationAction(ISD::FDIV, MVT::v4f16, Promote); 507 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 508 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 509 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 510 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 511 512 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 513 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 514 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 515 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 516 setOperationAction(ISD::SETCC, MVT::v4f16, Expand); 517 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 518 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 519 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 520 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 521 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 522 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 523 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 524 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 525 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 526 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 527 528 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 529 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 530 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 531 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 532 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 533 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 534 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 535 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 536 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 537 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 538 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 539 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 540 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 541 setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 542 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 543 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 544 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 545 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 546 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 547 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 548 } 549 550 // AArch64 has implementations of a lot of rounding-like FP operations. 
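  // These all have direct instruction support (the FRINT* family,
  // FMINNM/FMAXNM, FMIN/FMAX and friends), so no libcalls are needed.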
551 for (MVT Ty : {MVT::f32, MVT::f64}) { 552 setOperationAction(ISD::FFLOOR, Ty, Legal); 553 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 554 setOperationAction(ISD::FCEIL, Ty, Legal); 555 setOperationAction(ISD::FRINT, Ty, Legal); 556 setOperationAction(ISD::FTRUNC, Ty, Legal); 557 setOperationAction(ISD::FROUND, Ty, Legal); 558 setOperationAction(ISD::FMINNUM, Ty, Legal); 559 setOperationAction(ISD::FMAXNUM, Ty, Legal); 560 setOperationAction(ISD::FMINIMUM, Ty, Legal); 561 setOperationAction(ISD::FMAXIMUM, Ty, Legal); 562 setOperationAction(ISD::LROUND, Ty, Legal); 563 setOperationAction(ISD::LLROUND, Ty, Legal); 564 setOperationAction(ISD::LRINT, Ty, Legal); 565 setOperationAction(ISD::LLRINT, Ty, Legal); 566 } 567 568 if (Subtarget->hasFullFP16()) { 569 setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); 570 setOperationAction(ISD::FFLOOR, MVT::f16, Legal); 571 setOperationAction(ISD::FCEIL, MVT::f16, Legal); 572 setOperationAction(ISD::FRINT, MVT::f16, Legal); 573 setOperationAction(ISD::FTRUNC, MVT::f16, Legal); 574 setOperationAction(ISD::FROUND, MVT::f16, Legal); 575 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 576 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 577 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 578 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 579 } 580 581 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 582 583 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 584 585 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 586 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 587 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 588 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); 589 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 590 591 // 128-bit loads and stores can be done without expanding 592 setOperationAction(ISD::LOAD, MVT::i128, Custom); 593 setOperationAction(ISD::STORE, MVT::i128, Custom); 594 595 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the 596 // custom lowering, as there are no un-paired non-temporal stores and 597 // legalization will break up 256 bit inputs. 598 setOperationAction(ISD::STORE, MVT::v32i8, Custom); 599 setOperationAction(ISD::STORE, MVT::v16i16, Custom); 600 setOperationAction(ISD::STORE, MVT::v16f16, Custom); 601 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 602 setOperationAction(ISD::STORE, MVT::v8f32, Custom); 603 setOperationAction(ISD::STORE, MVT::v4f64, Custom); 604 setOperationAction(ISD::STORE, MVT::v4i64, Custom); 605 606 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. 607 // This requires the Performance Monitors extension. 608 if (Subtarget->hasPerfMon()) 609 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 610 611 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 612 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 613 // Issue __sincos_stret if available. 614 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 615 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 616 } else { 617 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 618 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 619 } 620 621 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 622 // MSVCRT doesn't have powi; fall back to pow 623 setLibcallName(RTLIB::POWI_F32, nullptr); 624 setLibcallName(RTLIB::POWI_F64, nullptr); 625 } 626 627 // Make floating-point constants legal for the large code model, so they don't 628 // become loads from the constant pool. 
629 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { 630 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 631 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 632 } 633 634 // AArch64 does not have floating-point extending loads, i1 sign-extending 635 // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 636 for (MVT VT : MVT::fp_valuetypes()) { 637 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 638 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 639 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); 640 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); 641 } 642 for (MVT VT : MVT::integer_valuetypes()) 643 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); 644 645 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 646 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 647 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 648 setTruncStoreAction(MVT::f128, MVT::f80, Expand); 649 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 650 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 651 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 652 653 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 654 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 655 setOperationAction(ISD::BITCAST, MVT::bf16, Custom); 656 657 // Indexed loads and stores are supported. 658 for (unsigned im = (unsigned)ISD::PRE_INC; 659 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 660 setIndexedLoadAction(im, MVT::i8, Legal); 661 setIndexedLoadAction(im, MVT::i16, Legal); 662 setIndexedLoadAction(im, MVT::i32, Legal); 663 setIndexedLoadAction(im, MVT::i64, Legal); 664 setIndexedLoadAction(im, MVT::f64, Legal); 665 setIndexedLoadAction(im, MVT::f32, Legal); 666 setIndexedLoadAction(im, MVT::f16, Legal); 667 setIndexedLoadAction(im, MVT::bf16, Legal); 668 setIndexedStoreAction(im, MVT::i8, Legal); 669 setIndexedStoreAction(im, MVT::i16, Legal); 670 setIndexedStoreAction(im, MVT::i32, Legal); 671 setIndexedStoreAction(im, MVT::i64, Legal); 672 setIndexedStoreAction(im, MVT::f64, Legal); 673 setIndexedStoreAction(im, MVT::f32, Legal); 674 setIndexedStoreAction(im, MVT::f16, Legal); 675 setIndexedStoreAction(im, MVT::bf16, Legal); 676 } 677 678 // Trap. 679 setOperationAction(ISD::TRAP, MVT::Other, Legal); 680 if (Subtarget->isTargetWindows()) 681 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 682 683 // We combine OR nodes for bitfield operations. 684 setTargetDAGCombine(ISD::OR); 685 // Try to create BICs for vector ANDs. 686 setTargetDAGCombine(ISD::AND); 687 688 // Vector add and sub nodes may conceal a high-half opportunity. 689 // Also, try to fold ADD into CSINC/CSINV.. 
690 setTargetDAGCombine(ISD::ADD); 691 setTargetDAGCombine(ISD::SUB); 692 setTargetDAGCombine(ISD::SRL); 693 setTargetDAGCombine(ISD::XOR); 694 setTargetDAGCombine(ISD::SINT_TO_FP); 695 setTargetDAGCombine(ISD::UINT_TO_FP); 696 697 setTargetDAGCombine(ISD::FP_TO_SINT); 698 setTargetDAGCombine(ISD::FP_TO_UINT); 699 setTargetDAGCombine(ISD::FDIV); 700 701 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 702 703 setTargetDAGCombine(ISD::ANY_EXTEND); 704 setTargetDAGCombine(ISD::ZERO_EXTEND); 705 setTargetDAGCombine(ISD::SIGN_EXTEND); 706 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 707 setTargetDAGCombine(ISD::CONCAT_VECTORS); 708 setTargetDAGCombine(ISD::STORE); 709 if (Subtarget->supportsAddressTopByteIgnored()) 710 setTargetDAGCombine(ISD::LOAD); 711 712 setTargetDAGCombine(ISD::MUL); 713 714 setTargetDAGCombine(ISD::SELECT); 715 setTargetDAGCombine(ISD::VSELECT); 716 717 setTargetDAGCombine(ISD::INTRINSIC_VOID); 718 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 719 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 720 721 setTargetDAGCombine(ISD::GlobalAddress); 722 723 // In case of strict alignment, avoid an excessive number of byte wide stores. 724 MaxStoresPerMemsetOptSize = 8; 725 MaxStoresPerMemset = Subtarget->requiresStrictAlign() 726 ? MaxStoresPerMemsetOptSize : 32; 727 728 MaxGluedStoresPerMemcpy = 4; 729 MaxStoresPerMemcpyOptSize = 4; 730 MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() 731 ? MaxStoresPerMemcpyOptSize : 16; 732 733 MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; 734 735 MaxLoadsPerMemcmpOptSize = 4; 736 MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() 737 ? MaxLoadsPerMemcmpOptSize : 8; 738 739 setStackPointerRegisterToSaveRestore(AArch64::SP); 740 741 setSchedulingPreference(Sched::Hybrid); 742 743 EnableExtLdPromotion = true; 744 745 // Set required alignment. 746 setMinFunctionAlignment(Align(4)); 747 // Set preferred alignments. 748 setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); 749 setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); 750 751 // Only change the limit for entries in a jump table if specified by 752 // the sub target, but not at the command line. 
753 unsigned MaxJT = STI.getMaximumJumpTableSize(); 754 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) 755 setMaximumJumpTableSize(MaxJT); 756 757 setHasExtractBitsInsn(true); 758 759 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 760 761 if (Subtarget->hasNEON()) { 762 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 763 // silliness like this: 764 setOperationAction(ISD::FABS, MVT::v1f64, Expand); 765 setOperationAction(ISD::FADD, MVT::v1f64, Expand); 766 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); 767 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); 768 setOperationAction(ISD::FCOS, MVT::v1f64, Expand); 769 setOperationAction(ISD::FDIV, MVT::v1f64, Expand); 770 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); 771 setOperationAction(ISD::FMA, MVT::v1f64, Expand); 772 setOperationAction(ISD::FMUL, MVT::v1f64, Expand); 773 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); 774 setOperationAction(ISD::FNEG, MVT::v1f64, Expand); 775 setOperationAction(ISD::FPOW, MVT::v1f64, Expand); 776 setOperationAction(ISD::FREM, MVT::v1f64, Expand); 777 setOperationAction(ISD::FROUND, MVT::v1f64, Expand); 778 setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 779 setOperationAction(ISD::FSIN, MVT::v1f64, Expand); 780 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); 781 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); 782 setOperationAction(ISD::FSUB, MVT::v1f64, Expand); 783 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); 784 setOperationAction(ISD::SETCC, MVT::v1f64, Expand); 785 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); 786 setOperationAction(ISD::SELECT, MVT::v1f64, Expand); 787 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); 788 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); 789 790 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); 791 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); 792 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); 793 setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); 794 setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); 795 796 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 797 798 // AArch64 doesn't have a direct vector ->f32 conversion instructions for 799 // elements smaller than i32, so promote the input to i32 first. 800 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); 801 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); 802 // i8 vector elements also need promotion to i32 for v8i8 803 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); 804 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); 805 // Similarly, there is no direct i32 -> f64 vector conversion instruction. 806 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 807 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); 808 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); 809 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); 810 // Or, direct i32 -> f16 vector conversion. 
Set it so custom, so the 811 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 812 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); 813 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); 814 815 if (Subtarget->hasFullFP16()) { 816 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 817 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 818 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 819 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 820 } else { 821 // when AArch64 doesn't have fullfp16 support, promote the input 822 // to i32 first. 823 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); 824 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); 825 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); 826 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); 827 } 828 829 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 830 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 831 832 // AArch64 doesn't have MUL.2d: 833 setOperationAction(ISD::MUL, MVT::v2i64, Expand); 834 // Custom handling for some quad-vector types to detect MULL. 835 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 836 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 837 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 838 839 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, 840 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { 841 // Vector reductions 842 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 843 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 844 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 845 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 846 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 847 848 // Saturates 849 setOperationAction(ISD::SADDSAT, VT, Legal); 850 setOperationAction(ISD::UADDSAT, VT, Legal); 851 setOperationAction(ISD::SSUBSAT, VT, Legal); 852 setOperationAction(ISD::USUBSAT, VT, Legal); 853 854 setOperationAction(ISD::TRUNCATE, VT, Custom); 855 } 856 for (MVT VT : { MVT::v4f16, MVT::v2f32, 857 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { 858 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 859 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 860 } 861 862 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); 863 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 864 // Likewise, narrowing and extending vector loads/stores aren't handled 865 // directly. 866 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 867 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 868 869 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { 870 setOperationAction(ISD::MULHS, VT, Legal); 871 setOperationAction(ISD::MULHU, VT, Legal); 872 } else { 873 setOperationAction(ISD::MULHS, VT, Expand); 874 setOperationAction(ISD::MULHU, VT, Expand); 875 } 876 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 877 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 878 879 setOperationAction(ISD::BSWAP, VT, Expand); 880 setOperationAction(ISD::CTTZ, VT, Expand); 881 882 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 883 setTruncStoreAction(VT, InnerVT, Expand); 884 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 885 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 886 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 887 } 888 } 889 890 // AArch64 has implementations of a lot of rounding-like FP operations. 
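    // The vector forms use the same FRINT* instructions as the scalars above.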
891 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { 892 setOperationAction(ISD::FFLOOR, Ty, Legal); 893 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 894 setOperationAction(ISD::FCEIL, Ty, Legal); 895 setOperationAction(ISD::FRINT, Ty, Legal); 896 setOperationAction(ISD::FTRUNC, Ty, Legal); 897 setOperationAction(ISD::FROUND, Ty, Legal); 898 } 899 900 if (Subtarget->hasFullFP16()) { 901 for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { 902 setOperationAction(ISD::FFLOOR, Ty, Legal); 903 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 904 setOperationAction(ISD::FCEIL, Ty, Legal); 905 setOperationAction(ISD::FRINT, Ty, Legal); 906 setOperationAction(ISD::FTRUNC, Ty, Legal); 907 setOperationAction(ISD::FROUND, Ty, Legal); 908 } 909 } 910 911 if (Subtarget->hasSVE()) 912 setOperationAction(ISD::VSCALE, MVT::i32, Custom); 913 914 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); 915 } 916 917 if (Subtarget->hasSVE()) { 918 // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a 919 // splat of 0 or undef) once vector selects supported in SVE codegen. See 920 // D68877 for more details. 921 for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { 922 if (isTypeLegal(VT)) { 923 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 924 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 925 setOperationAction(ISD::SELECT, VT, Custom); 926 setOperationAction(ISD::SDIV, VT, Custom); 927 setOperationAction(ISD::UDIV, VT, Custom); 928 setOperationAction(ISD::SMIN, VT, Custom); 929 setOperationAction(ISD::UMIN, VT, Custom); 930 setOperationAction(ISD::SMAX, VT, Custom); 931 setOperationAction(ISD::UMAX, VT, Custom); 932 setOperationAction(ISD::SHL, VT, Custom); 933 setOperationAction(ISD::SRL, VT, Custom); 934 setOperationAction(ISD::SRA, VT, Custom); 935 if (VT.getScalarType() == MVT::i1) { 936 setOperationAction(ISD::SETCC, VT, Custom); 937 setOperationAction(ISD::TRUNCATE, VT, Custom); 938 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 939 } 940 } 941 } 942 943 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) 944 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 945 946 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); 947 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); 948 949 for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { 950 if (isTypeLegal(VT)) { 951 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 952 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 953 setOperationAction(ISD::SELECT, VT, Custom); 954 setOperationAction(ISD::FMA, VT, Custom); 955 } 956 } 957 958 // NOTE: Currently this has to happen after computeRegisterProperties rather 959 // than the preferred option of combining it with the addRegisterClass call. 960 if (useSVEForFixedLengthVectors()) { 961 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 962 if (useSVEForFixedLengthVectorVT(VT)) 963 addTypeForFixedLengthSVE(VT); 964 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) 965 if (useSVEForFixedLengthVectorVT(VT)) 966 addTypeForFixedLengthSVE(VT); 967 968 // 64bit results can mean a bigger than NEON input. 969 for (auto VT : {MVT::v8i8, MVT::v4i16}) 970 setOperationAction(ISD::TRUNCATE, VT, Custom); 971 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); 972 973 // 128bit results imply a bigger than NEON input. 
974 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) 975 setOperationAction(ISD::TRUNCATE, VT, Custom); 976 for (auto VT : {MVT::v8f16, MVT::v4f32}) 977 setOperationAction(ISD::FP_ROUND, VT, Expand); 978 } 979 } 980 981 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 982 } 983 984 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { 985 assert(VT.isVector() && "VT should be a vector type"); 986 987 if (VT.isFloatingPoint()) { 988 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); 989 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); 990 setOperationPromotedToType(ISD::STORE, VT, PromoteTo); 991 } 992 993 // Mark vector float intrinsics as expand. 994 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 995 setOperationAction(ISD::FSIN, VT, Expand); 996 setOperationAction(ISD::FCOS, VT, Expand); 997 setOperationAction(ISD::FPOW, VT, Expand); 998 setOperationAction(ISD::FLOG, VT, Expand); 999 setOperationAction(ISD::FLOG2, VT, Expand); 1000 setOperationAction(ISD::FLOG10, VT, Expand); 1001 setOperationAction(ISD::FEXP, VT, Expand); 1002 setOperationAction(ISD::FEXP2, VT, Expand); 1003 1004 // But we do support custom-lowering for FCOPYSIGN. 1005 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1006 } 1007 1008 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1009 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1010 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1011 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1012 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1013 setOperationAction(ISD::SRA, VT, Custom); 1014 setOperationAction(ISD::SRL, VT, Custom); 1015 setOperationAction(ISD::SHL, VT, Custom); 1016 setOperationAction(ISD::OR, VT, Custom); 1017 setOperationAction(ISD::SETCC, VT, Custom); 1018 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 1019 1020 setOperationAction(ISD::SELECT, VT, Expand); 1021 setOperationAction(ISD::SELECT_CC, VT, Expand); 1022 setOperationAction(ISD::VSELECT, VT, Expand); 1023 for (MVT InnerVT : MVT::all_valuetypes()) 1024 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 1025 1026 // CNT supports only B element sizes, then use UADDLP to widen. 1027 if (VT != MVT::v8i8 && VT != MVT::v16i8) 1028 setOperationAction(ISD::CTPOP, VT, Custom); 1029 1030 setOperationAction(ISD::UDIV, VT, Expand); 1031 setOperationAction(ISD::SDIV, VT, Expand); 1032 setOperationAction(ISD::UREM, VT, Expand); 1033 setOperationAction(ISD::SREM, VT, Expand); 1034 setOperationAction(ISD::FREM, VT, Expand); 1035 1036 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1037 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1038 1039 if (!VT.isFloatingPoint()) 1040 setOperationAction(ISD::ABS, VT, Legal); 1041 1042 // [SU][MIN|MAX] are available for all NEON types apart from i64. 1043 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 1044 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 1045 setOperationAction(Opcode, VT, Legal); 1046 1047 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. 
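  // (Half-precision vectors additionally require the full FP16 extension,
  // hence the check below.)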
1048 if (VT.isFloatingPoint() && 1049 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) 1050 for (unsigned Opcode : 1051 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) 1052 setOperationAction(Opcode, VT, Legal); 1053 1054 if (Subtarget->isLittleEndian()) { 1055 for (unsigned im = (unsigned)ISD::PRE_INC; 1056 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1057 setIndexedLoadAction(im, VT, Legal); 1058 setIndexedStoreAction(im, VT, Legal); 1059 } 1060 } 1061 } 1062 1063 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { 1064 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 1065 1066 // By default everything must be expanded. 1067 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) 1068 setOperationAction(Op, VT, Expand); 1069 1070 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. 1071 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1072 1073 // Lower fixed length vector operations to scalable equivalents. 1074 setOperationAction(ISD::ADD, VT, Custom); 1075 setOperationAction(ISD::FADD, VT, Custom); 1076 setOperationAction(ISD::LOAD, VT, Custom); 1077 setOperationAction(ISD::STORE, VT, Custom); 1078 setOperationAction(ISD::TRUNCATE, VT, Custom); 1079 } 1080 1081 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 1082 addRegisterClass(VT, &AArch64::FPR64RegClass); 1083 addTypeForNEON(VT, MVT::v2i32); 1084 } 1085 1086 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 1087 addRegisterClass(VT, &AArch64::FPR128RegClass); 1088 addTypeForNEON(VT, MVT::v4i32); 1089 } 1090 1091 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, 1092 LLVMContext &C, EVT VT) const { 1093 if (!VT.isVector()) 1094 return MVT::i32; 1095 if (VT.isScalableVector()) 1096 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); 1097 return VT.changeVectorElementTypeToInteger(); 1098 } 1099 1100 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, 1101 const APInt &Demanded, 1102 TargetLowering::TargetLoweringOpt &TLO, 1103 unsigned NewOpc) { 1104 uint64_t OldImm = Imm, NewImm, Enc; 1105 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; 1106 1107 // Return if the immediate is already all zeros, all ones, a bimm32 or a 1108 // bimm64. 1109 if (Imm == 0 || Imm == Mask || 1110 AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) 1111 return false; 1112 1113 unsigned EltSize = Size; 1114 uint64_t DemandedBits = Demanded.getZExtValue(); 1115 1116 // Clear bits that are not demanded. 1117 Imm &= DemandedBits; 1118 1119 while (true) { 1120 // The goal here is to set the non-demanded bits in a way that minimizes 1121 // the number of switching between 0 and 1. In order to achieve this goal, 1122 // we set the non-demanded bits to the value of the preceding demanded bits. 1123 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a 1124 // non-demanded bit), we copy bit0 (1) to the least significant 'x', 1125 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. 1126 // The final result is 0b11000011. 
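    // The arithmetic below performs this copy with a carry-propagation trick
    // rather than iterating over the individual bits.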
1127 uint64_t NonDemandedBits = ~DemandedBits; 1128 uint64_t InvertedImm = ~Imm & DemandedBits; 1129 uint64_t RotatedImm = 1130 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & 1131 NonDemandedBits; 1132 uint64_t Sum = RotatedImm + NonDemandedBits; 1133 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); 1134 uint64_t Ones = (Sum + Carry) & NonDemandedBits; 1135 NewImm = (Imm | Ones) & Mask; 1136 1137 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate 1138 // or all-ones or all-zeros, in which case we can stop searching. Otherwise, 1139 // we halve the element size and continue the search. 1140 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) 1141 break; 1142 1143 // We cannot shrink the element size any further if it is 2-bits. 1144 if (EltSize == 2) 1145 return false; 1146 1147 EltSize /= 2; 1148 Mask >>= EltSize; 1149 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; 1150 1151 // Return if there is mismatch in any of the demanded bits of Imm and Hi. 1152 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) 1153 return false; 1154 1155 // Merge the upper and lower halves of Imm and DemandedBits. 1156 Imm |= Hi; 1157 DemandedBits |= DemandedBitsHi; 1158 } 1159 1160 ++NumOptimizedImms; 1161 1162 // Replicate the element across the register width. 1163 while (EltSize < Size) { 1164 NewImm |= NewImm << EltSize; 1165 EltSize *= 2; 1166 } 1167 1168 (void)OldImm; 1169 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && 1170 "demanded bits should never be altered"); 1171 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); 1172 1173 // Create the new constant immediate node. 1174 EVT VT = Op.getValueType(); 1175 SDLoc DL(Op); 1176 SDValue New; 1177 1178 // If the new constant immediate is all-zeros or all-ones, let the target 1179 // independent DAG combine optimize this node. 1180 if (NewImm == 0 || NewImm == OrigMask) { 1181 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 1182 TLO.DAG.getConstant(NewImm, DL, VT)); 1183 // Otherwise, create a machine node so that target independent DAG combine 1184 // doesn't undo this optimization. 1185 } else { 1186 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); 1187 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); 1188 New = SDValue( 1189 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); 1190 } 1191 1192 return TLO.CombineTo(Op, New); 1193 } 1194 1195 bool AArch64TargetLowering::targetShrinkDemandedConstant( 1196 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 1197 TargetLoweringOpt &TLO) const { 1198 // Delay this optimization to as late as possible. 1199 if (!TLO.LegalOps) 1200 return false; 1201 1202 if (!EnableOptimizeLogicalImm) 1203 return false; 1204 1205 EVT VT = Op.getValueType(); 1206 if (VT.isVector()) 1207 return false; 1208 1209 unsigned Size = VT.getSizeInBits(); 1210 assert((Size == 32 || Size == 64) && 1211 "i32 or i64 is expected after legalization."); 1212 1213 // Exit early if we demand all bits. 1214 if (DemandedBits.countPopulation() == Size) 1215 return false; 1216 1217 unsigned NewOpc; 1218 switch (Op.getOpcode()) { 1219 default: 1220 return false; 1221 case ISD::AND: 1222 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; 1223 break; 1224 case ISD::OR: 1225 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; 1226 break; 1227 case ISD::XOR: 1228 NewOpc = Size == 32 ? 
AArch64::EORWri : AArch64::EORXri; 1229 break; 1230 } 1231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 1232 if (!C) 1233 return false; 1234 uint64_t Imm = C->getZExtValue(); 1235 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); 1236 } 1237 1238 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 1239 /// Mask are known to be either zero or one and return them Known. 1240 void AArch64TargetLowering::computeKnownBitsForTargetNode( 1241 const SDValue Op, KnownBits &Known, 1242 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { 1243 switch (Op.getOpcode()) { 1244 default: 1245 break; 1246 case AArch64ISD::CSEL: { 1247 KnownBits Known2; 1248 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 1249 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 1250 Known.Zero &= Known2.Zero; 1251 Known.One &= Known2.One; 1252 break; 1253 } 1254 case AArch64ISD::LOADgot: 1255 case AArch64ISD::ADDlow: { 1256 if (!Subtarget->isTargetILP32()) 1257 break; 1258 // In ILP32 mode all valid pointers are in the low 4GB of the address-space. 1259 Known.Zero = APInt::getHighBitsSet(64, 32); 1260 break; 1261 } 1262 case ISD::INTRINSIC_W_CHAIN: { 1263 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 1264 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 1265 switch (IntID) { 1266 default: return; 1267 case Intrinsic::aarch64_ldaxr: 1268 case Intrinsic::aarch64_ldxr: { 1269 unsigned BitWidth = Known.getBitWidth(); 1270 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 1271 unsigned MemBits = VT.getScalarSizeInBits(); 1272 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 1273 return; 1274 } 1275 } 1276 break; 1277 } 1278 case ISD::INTRINSIC_WO_CHAIN: 1279 case ISD::INTRINSIC_VOID: { 1280 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 1281 switch (IntNo) { 1282 default: 1283 break; 1284 case Intrinsic::aarch64_neon_umaxv: 1285 case Intrinsic::aarch64_neon_uminv: { 1286 // Figure out the datatype of the vector operand. The UMINV instruction 1287 // will zero extend the result, so we can mark as known zero all the 1288 // bits larger than the element datatype. 32-bit or larget doesn't need 1289 // this as those are legal types and will be handled by isel directly. 1290 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 1291 unsigned BitWidth = Known.getBitWidth(); 1292 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 1293 assert(BitWidth >= 8 && "Unexpected width!"); 1294 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 1295 Known.Zero |= Mask; 1296 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 1297 assert(BitWidth >= 16 && "Unexpected width!"); 1298 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 1299 Known.Zero |= Mask; 1300 } 1301 break; 1302 } break; 1303 } 1304 } 1305 } 1306 } 1307 1308 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 1309 EVT) const { 1310 return MVT::i64; 1311 } 1312 1313 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 1314 EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, 1315 bool *Fast) const { 1316 if (Subtarget->requiresStrictAlign()) 1317 return false; 1318 1319 if (Fast) { 1320 // Some CPUs are fine with unaligned stores except for 128-bit ones. 1321 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 1322 // See comments in performSTORECombine() for more details about 1323 // these conditions. 
1324 1325 // Code that uses clang vector extensions can mark that it 1326 // wants unaligned accesses to be treated as fast by 1327 // underspecifying alignment to be 1 or 2. 1328 Align <= 2 || 1329 1330 // Disregard v2i64. Memcpy lowering produces those and splitting 1331 // them regresses performance on micro-benchmarks and olden/bh. 1332 VT == MVT::v2i64; 1333 } 1334 return true; 1335 } 1336 1337 // Same as above but handling LLTs instead. 1338 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( 1339 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1340 bool *Fast) const { 1341 if (Subtarget->requiresStrictAlign()) 1342 return false; 1343 1344 if (Fast) { 1345 // Some CPUs are fine with unaligned stores except for 128-bit ones. 1346 *Fast = !Subtarget->isMisaligned128StoreSlow() || 1347 Ty.getSizeInBytes() != 16 || 1348 // See comments in performSTORECombine() for more details about 1349 // these conditions. 1350 1351 // Code that uses clang vector extensions can mark that it 1352 // wants unaligned accesses to be treated as fast by 1353 // underspecifying alignment to be 1 or 2. 1354 Alignment <= 2 || 1355 1356 // Disregard v2i64. Memcpy lowering produces those and splitting 1357 // them regresses performance on micro-benchmarks and olden/bh. 1358 Ty == LLT::vector(2, 64); 1359 } 1360 return true; 1361 } 1362 1363 FastISel * 1364 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1365 const TargetLibraryInfo *libInfo) const { 1366 return AArch64::createFastISel(funcInfo, libInfo); 1367 } 1368 1369 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 1370 #define MAKE_CASE(V) \ 1371 case V: \ 1372 return #V; 1373 switch ((AArch64ISD::NodeType)Opcode) { 1374 case AArch64ISD::FIRST_NUMBER: 1375 break; 1376 MAKE_CASE(AArch64ISD::CALL) 1377 MAKE_CASE(AArch64ISD::ADRP) 1378 MAKE_CASE(AArch64ISD::ADR) 1379 MAKE_CASE(AArch64ISD::ADDlow) 1380 MAKE_CASE(AArch64ISD::LOADgot) 1381 MAKE_CASE(AArch64ISD::RET_FLAG) 1382 MAKE_CASE(AArch64ISD::BRCOND) 1383 MAKE_CASE(AArch64ISD::CSEL) 1384 MAKE_CASE(AArch64ISD::FCSEL) 1385 MAKE_CASE(AArch64ISD::CSINV) 1386 MAKE_CASE(AArch64ISD::CSNEG) 1387 MAKE_CASE(AArch64ISD::CSINC) 1388 MAKE_CASE(AArch64ISD::THREAD_POINTER) 1389 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) 1390 MAKE_CASE(AArch64ISD::ADD_PRED) 1391 MAKE_CASE(AArch64ISD::SDIV_PRED) 1392 MAKE_CASE(AArch64ISD::UDIV_PRED) 1393 MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1) 1394 MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1) 1395 MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1) 1396 MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1) 1397 MAKE_CASE(AArch64ISD::SHL_MERGE_OP1) 1398 MAKE_CASE(AArch64ISD::SRL_MERGE_OP1) 1399 MAKE_CASE(AArch64ISD::SRA_MERGE_OP1) 1400 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) 1401 MAKE_CASE(AArch64ISD::ADC) 1402 MAKE_CASE(AArch64ISD::SBC) 1403 MAKE_CASE(AArch64ISD::ADDS) 1404 MAKE_CASE(AArch64ISD::SUBS) 1405 MAKE_CASE(AArch64ISD::ADCS) 1406 MAKE_CASE(AArch64ISD::SBCS) 1407 MAKE_CASE(AArch64ISD::ANDS) 1408 MAKE_CASE(AArch64ISD::CCMP) 1409 MAKE_CASE(AArch64ISD::CCMN) 1410 MAKE_CASE(AArch64ISD::FCCMP) 1411 MAKE_CASE(AArch64ISD::FCMP) 1412 MAKE_CASE(AArch64ISD::STRICT_FCMP) 1413 MAKE_CASE(AArch64ISD::STRICT_FCMPE) 1414 MAKE_CASE(AArch64ISD::DUP) 1415 MAKE_CASE(AArch64ISD::DUPLANE8) 1416 MAKE_CASE(AArch64ISD::DUPLANE16) 1417 MAKE_CASE(AArch64ISD::DUPLANE32) 1418 MAKE_CASE(AArch64ISD::DUPLANE64) 1419 MAKE_CASE(AArch64ISD::MOVI) 1420 MAKE_CASE(AArch64ISD::MOVIshift) 1421 MAKE_CASE(AArch64ISD::MOVIedit) 1422 MAKE_CASE(AArch64ISD::MOVImsl) 1423 
MAKE_CASE(AArch64ISD::FMOV) 1424 MAKE_CASE(AArch64ISD::MVNIshift) 1425 MAKE_CASE(AArch64ISD::MVNImsl) 1426 MAKE_CASE(AArch64ISD::BICi) 1427 MAKE_CASE(AArch64ISD::ORRi) 1428 MAKE_CASE(AArch64ISD::BSP) 1429 MAKE_CASE(AArch64ISD::NEG) 1430 MAKE_CASE(AArch64ISD::EXTR) 1431 MAKE_CASE(AArch64ISD::ZIP1) 1432 MAKE_CASE(AArch64ISD::ZIP2) 1433 MAKE_CASE(AArch64ISD::UZP1) 1434 MAKE_CASE(AArch64ISD::UZP2) 1435 MAKE_CASE(AArch64ISD::TRN1) 1436 MAKE_CASE(AArch64ISD::TRN2) 1437 MAKE_CASE(AArch64ISD::REV16) 1438 MAKE_CASE(AArch64ISD::REV32) 1439 MAKE_CASE(AArch64ISD::REV64) 1440 MAKE_CASE(AArch64ISD::EXT) 1441 MAKE_CASE(AArch64ISD::VSHL) 1442 MAKE_CASE(AArch64ISD::VLSHR) 1443 MAKE_CASE(AArch64ISD::VASHR) 1444 MAKE_CASE(AArch64ISD::VSLI) 1445 MAKE_CASE(AArch64ISD::VSRI) 1446 MAKE_CASE(AArch64ISD::CMEQ) 1447 MAKE_CASE(AArch64ISD::CMGE) 1448 MAKE_CASE(AArch64ISD::CMGT) 1449 MAKE_CASE(AArch64ISD::CMHI) 1450 MAKE_CASE(AArch64ISD::CMHS) 1451 MAKE_CASE(AArch64ISD::FCMEQ) 1452 MAKE_CASE(AArch64ISD::FCMGE) 1453 MAKE_CASE(AArch64ISD::FCMGT) 1454 MAKE_CASE(AArch64ISD::CMEQz) 1455 MAKE_CASE(AArch64ISD::CMGEz) 1456 MAKE_CASE(AArch64ISD::CMGTz) 1457 MAKE_CASE(AArch64ISD::CMLEz) 1458 MAKE_CASE(AArch64ISD::CMLTz) 1459 MAKE_CASE(AArch64ISD::FCMEQz) 1460 MAKE_CASE(AArch64ISD::FCMGEz) 1461 MAKE_CASE(AArch64ISD::FCMGTz) 1462 MAKE_CASE(AArch64ISD::FCMLEz) 1463 MAKE_CASE(AArch64ISD::FCMLTz) 1464 MAKE_CASE(AArch64ISD::SADDV) 1465 MAKE_CASE(AArch64ISD::UADDV) 1466 MAKE_CASE(AArch64ISD::SRHADD) 1467 MAKE_CASE(AArch64ISD::URHADD) 1468 MAKE_CASE(AArch64ISD::SMINV) 1469 MAKE_CASE(AArch64ISD::UMINV) 1470 MAKE_CASE(AArch64ISD::SMAXV) 1471 MAKE_CASE(AArch64ISD::UMAXV) 1472 MAKE_CASE(AArch64ISD::SMAXV_PRED) 1473 MAKE_CASE(AArch64ISD::UMAXV_PRED) 1474 MAKE_CASE(AArch64ISD::SMINV_PRED) 1475 MAKE_CASE(AArch64ISD::UMINV_PRED) 1476 MAKE_CASE(AArch64ISD::ORV_PRED) 1477 MAKE_CASE(AArch64ISD::EORV_PRED) 1478 MAKE_CASE(AArch64ISD::ANDV_PRED) 1479 MAKE_CASE(AArch64ISD::CLASTA_N) 1480 MAKE_CASE(AArch64ISD::CLASTB_N) 1481 MAKE_CASE(AArch64ISD::LASTA) 1482 MAKE_CASE(AArch64ISD::LASTB) 1483 MAKE_CASE(AArch64ISD::REV) 1484 MAKE_CASE(AArch64ISD::REINTERPRET_CAST) 1485 MAKE_CASE(AArch64ISD::TBL) 1486 MAKE_CASE(AArch64ISD::FADD_PRED) 1487 MAKE_CASE(AArch64ISD::FADDA_PRED) 1488 MAKE_CASE(AArch64ISD::FADDV_PRED) 1489 MAKE_CASE(AArch64ISD::FMA_PRED) 1490 MAKE_CASE(AArch64ISD::FMAXV_PRED) 1491 MAKE_CASE(AArch64ISD::FMAXNMV_PRED) 1492 MAKE_CASE(AArch64ISD::FMINV_PRED) 1493 MAKE_CASE(AArch64ISD::FMINNMV_PRED) 1494 MAKE_CASE(AArch64ISD::NOT) 1495 MAKE_CASE(AArch64ISD::BIT) 1496 MAKE_CASE(AArch64ISD::CBZ) 1497 MAKE_CASE(AArch64ISD::CBNZ) 1498 MAKE_CASE(AArch64ISD::TBZ) 1499 MAKE_CASE(AArch64ISD::TBNZ) 1500 MAKE_CASE(AArch64ISD::TC_RETURN) 1501 MAKE_CASE(AArch64ISD::PREFETCH) 1502 MAKE_CASE(AArch64ISD::SITOF) 1503 MAKE_CASE(AArch64ISD::UITOF) 1504 MAKE_CASE(AArch64ISD::NVCAST) 1505 MAKE_CASE(AArch64ISD::SQSHL_I) 1506 MAKE_CASE(AArch64ISD::UQSHL_I) 1507 MAKE_CASE(AArch64ISD::SRSHR_I) 1508 MAKE_CASE(AArch64ISD::URSHR_I) 1509 MAKE_CASE(AArch64ISD::SQSHLU_I) 1510 MAKE_CASE(AArch64ISD::WrapperLarge) 1511 MAKE_CASE(AArch64ISD::LD2post) 1512 MAKE_CASE(AArch64ISD::LD3post) 1513 MAKE_CASE(AArch64ISD::LD4post) 1514 MAKE_CASE(AArch64ISD::ST2post) 1515 MAKE_CASE(AArch64ISD::ST3post) 1516 MAKE_CASE(AArch64ISD::ST4post) 1517 MAKE_CASE(AArch64ISD::LD1x2post) 1518 MAKE_CASE(AArch64ISD::LD1x3post) 1519 MAKE_CASE(AArch64ISD::LD1x4post) 1520 MAKE_CASE(AArch64ISD::ST1x2post) 1521 MAKE_CASE(AArch64ISD::ST1x3post) 1522 MAKE_CASE(AArch64ISD::ST1x4post) 1523 
MAKE_CASE(AArch64ISD::LD1DUPpost) 1524 MAKE_CASE(AArch64ISD::LD2DUPpost) 1525 MAKE_CASE(AArch64ISD::LD3DUPpost) 1526 MAKE_CASE(AArch64ISD::LD4DUPpost) 1527 MAKE_CASE(AArch64ISD::LD1LANEpost) 1528 MAKE_CASE(AArch64ISD::LD2LANEpost) 1529 MAKE_CASE(AArch64ISD::LD3LANEpost) 1530 MAKE_CASE(AArch64ISD::LD4LANEpost) 1531 MAKE_CASE(AArch64ISD::ST2LANEpost) 1532 MAKE_CASE(AArch64ISD::ST3LANEpost) 1533 MAKE_CASE(AArch64ISD::ST4LANEpost) 1534 MAKE_CASE(AArch64ISD::SMULL) 1535 MAKE_CASE(AArch64ISD::UMULL) 1536 MAKE_CASE(AArch64ISD::FRECPE) 1537 MAKE_CASE(AArch64ISD::FRECPS) 1538 MAKE_CASE(AArch64ISD::FRSQRTE) 1539 MAKE_CASE(AArch64ISD::FRSQRTS) 1540 MAKE_CASE(AArch64ISD::STG) 1541 MAKE_CASE(AArch64ISD::STZG) 1542 MAKE_CASE(AArch64ISD::ST2G) 1543 MAKE_CASE(AArch64ISD::STZ2G) 1544 MAKE_CASE(AArch64ISD::SUNPKHI) 1545 MAKE_CASE(AArch64ISD::SUNPKLO) 1546 MAKE_CASE(AArch64ISD::UUNPKHI) 1547 MAKE_CASE(AArch64ISD::UUNPKLO) 1548 MAKE_CASE(AArch64ISD::INSR) 1549 MAKE_CASE(AArch64ISD::PTEST) 1550 MAKE_CASE(AArch64ISD::PTRUE) 1551 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) 1552 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) 1553 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) 1554 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) 1555 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) 1556 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) 1557 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) 1558 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) 1559 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) 1560 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) 1561 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) 1562 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) 1563 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) 1564 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) 1565 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) 1566 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) 1567 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) 1568 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) 1569 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) 1570 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) 1571 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) 1572 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) 1573 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) 1574 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) 1575 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) 1576 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) 1577 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) 1578 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) 1579 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) 1580 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) 1581 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) 1582 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) 1583 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) 1584 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) 1585 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) 1586 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) 1587 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) 1588 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) 1589 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) 1590 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) 1591 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) 1592 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) 1593 MAKE_CASE(AArch64ISD::ST1_PRED) 1594 MAKE_CASE(AArch64ISD::SST1_PRED) 1595 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) 1596 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) 1597 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) 1598 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) 1599 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) 1600 MAKE_CASE(AArch64ISD::SST1_IMM_PRED) 1601 MAKE_CASE(AArch64ISD::SSTNT1_PRED) 1602 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) 
1603 MAKE_CASE(AArch64ISD::LDP) 1604 MAKE_CASE(AArch64ISD::STP) 1605 MAKE_CASE(AArch64ISD::STNP) 1606 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) 1607 MAKE_CASE(AArch64ISD::INDEX_VECTOR) 1608 } 1609 #undef MAKE_CASE 1610 return nullptr; 1611 } 1612 1613 MachineBasicBlock * 1614 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 1615 MachineBasicBlock *MBB) const { 1616 // We materialise the F128CSEL pseudo-instruction as some control flow and a 1617 // phi node: 1618 1619 // OrigBB: 1620 // [... previous instrs leading to comparison ...] 1621 // b.ne TrueBB 1622 // b EndBB 1623 // TrueBB: 1624 // ; Fallthrough 1625 // EndBB: 1626 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 1627 1628 MachineFunction *MF = MBB->getParent(); 1629 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1630 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 1631 DebugLoc DL = MI.getDebugLoc(); 1632 MachineFunction::iterator It = ++MBB->getIterator(); 1633 1634 Register DestReg = MI.getOperand(0).getReg(); 1635 Register IfTrueReg = MI.getOperand(1).getReg(); 1636 Register IfFalseReg = MI.getOperand(2).getReg(); 1637 unsigned CondCode = MI.getOperand(3).getImm(); 1638 bool NZCVKilled = MI.getOperand(4).isKill(); 1639 1640 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 1641 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 1642 MF->insert(It, TrueBB); 1643 MF->insert(It, EndBB); 1644 1645 // Transfer rest of current basic-block to EndBB 1646 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 1647 MBB->end()); 1648 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 1649 1650 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 1651 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 1652 MBB->addSuccessor(TrueBB); 1653 MBB->addSuccessor(EndBB); 1654 1655 // TrueBB falls through to the end. 1656 TrueBB->addSuccessor(EndBB); 1657 1658 if (!NZCVKilled) { 1659 TrueBB->addLiveIn(AArch64::NZCV); 1660 EndBB->addLiveIn(AArch64::NZCV); 1661 } 1662 1663 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 1664 .addReg(IfTrueReg) 1665 .addMBB(TrueBB) 1666 .addReg(IfFalseReg) 1667 .addMBB(MBB); 1668 1669 MI.eraseFromParent(); 1670 return EndBB; 1671 } 1672 1673 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( 1674 MachineInstr &MI, MachineBasicBlock *BB) const { 1675 assert(!isAsynchronousEHPersonality(classifyEHPersonality( 1676 BB->getParent()->getFunction().getPersonalityFn())) && 1677 "SEH does not use catchret!"); 1678 return BB; 1679 } 1680 1681 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 1682 MachineInstr &MI, MachineBasicBlock *BB) const { 1683 switch (MI.getOpcode()) { 1684 default: 1685 #ifndef NDEBUG 1686 MI.dump(); 1687 #endif 1688 llvm_unreachable("Unexpected instruction for custom inserter!"); 1689 1690 case AArch64::F128CSEL: 1691 return EmitF128CSEL(MI, BB); 1692 1693 case TargetOpcode::STACKMAP: 1694 case TargetOpcode::PATCHPOINT: 1695 return emitPatchPoint(MI, BB); 1696 1697 case AArch64::CATCHRET: 1698 return EmitLoweredCatchRet(MI, BB); 1699 } 1700 } 1701 1702 //===----------------------------------------------------------------------===// 1703 // AArch64 Lowering private implementation. 
1704 //===----------------------------------------------------------------------===// 1705 1706 //===----------------------------------------------------------------------===// 1707 // Lowering Code 1708 //===----------------------------------------------------------------------===// 1709 1710 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 1711 /// CC 1712 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 1713 switch (CC) { 1714 default: 1715 llvm_unreachable("Unknown condition code!"); 1716 case ISD::SETNE: 1717 return AArch64CC::NE; 1718 case ISD::SETEQ: 1719 return AArch64CC::EQ; 1720 case ISD::SETGT: 1721 return AArch64CC::GT; 1722 case ISD::SETGE: 1723 return AArch64CC::GE; 1724 case ISD::SETLT: 1725 return AArch64CC::LT; 1726 case ISD::SETLE: 1727 return AArch64CC::LE; 1728 case ISD::SETUGT: 1729 return AArch64CC::HI; 1730 case ISD::SETUGE: 1731 return AArch64CC::HS; 1732 case ISD::SETULT: 1733 return AArch64CC::LO; 1734 case ISD::SETULE: 1735 return AArch64CC::LS; 1736 } 1737 } 1738 1739 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 1740 static void changeFPCCToAArch64CC(ISD::CondCode CC, 1741 AArch64CC::CondCode &CondCode, 1742 AArch64CC::CondCode &CondCode2) { 1743 CondCode2 = AArch64CC::AL; 1744 switch (CC) { 1745 default: 1746 llvm_unreachable("Unknown FP condition!"); 1747 case ISD::SETEQ: 1748 case ISD::SETOEQ: 1749 CondCode = AArch64CC::EQ; 1750 break; 1751 case ISD::SETGT: 1752 case ISD::SETOGT: 1753 CondCode = AArch64CC::GT; 1754 break; 1755 case ISD::SETGE: 1756 case ISD::SETOGE: 1757 CondCode = AArch64CC::GE; 1758 break; 1759 case ISD::SETOLT: 1760 CondCode = AArch64CC::MI; 1761 break; 1762 case ISD::SETOLE: 1763 CondCode = AArch64CC::LS; 1764 break; 1765 case ISD::SETONE: 1766 CondCode = AArch64CC::MI; 1767 CondCode2 = AArch64CC::GT; 1768 break; 1769 case ISD::SETO: 1770 CondCode = AArch64CC::VC; 1771 break; 1772 case ISD::SETUO: 1773 CondCode = AArch64CC::VS; 1774 break; 1775 case ISD::SETUEQ: 1776 CondCode = AArch64CC::EQ; 1777 CondCode2 = AArch64CC::VS; 1778 break; 1779 case ISD::SETUGT: 1780 CondCode = AArch64CC::HI; 1781 break; 1782 case ISD::SETUGE: 1783 CondCode = AArch64CC::PL; 1784 break; 1785 case ISD::SETLT: 1786 case ISD::SETULT: 1787 CondCode = AArch64CC::LT; 1788 break; 1789 case ISD::SETLE: 1790 case ISD::SETULE: 1791 CondCode = AArch64CC::LE; 1792 break; 1793 case ISD::SETNE: 1794 case ISD::SETUNE: 1795 CondCode = AArch64CC::NE; 1796 break; 1797 } 1798 } 1799 1800 /// Convert a DAG fp condition code to an AArch64 CC. 1801 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 1802 /// should be AND'ed instead of OR'ed. 1803 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 1804 AArch64CC::CondCode &CondCode, 1805 AArch64CC::CondCode &CondCode2) { 1806 CondCode2 = AArch64CC::AL; 1807 switch (CC) { 1808 default: 1809 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1810 assert(CondCode2 == AArch64CC::AL); 1811 break; 1812 case ISD::SETONE: 1813 // (a one b) 1814 // == ((a olt b) || (a ogt b)) 1815 // == ((a ord b) && (a une b)) 1816 CondCode = AArch64CC::VC; 1817 CondCode2 = AArch64CC::NE; 1818 break; 1819 case ISD::SETUEQ: 1820 // (a ueq b) 1821 // == ((a uno b) || (a oeq b)) 1822 // == ((a ule b) && (a uge b)) 1823 CondCode = AArch64CC::PL; 1824 CondCode2 = AArch64CC::LE; 1825 break; 1826 } 1827 } 1828 1829 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1830 /// CC usable with the vector instructions. 
Fewer operations are available 1831 /// without a real NZCV register, so we have to use less efficient combinations 1832 /// to get the same effect. 1833 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1834 AArch64CC::CondCode &CondCode, 1835 AArch64CC::CondCode &CondCode2, 1836 bool &Invert) { 1837 Invert = false; 1838 switch (CC) { 1839 default: 1840 // Mostly the scalar mappings work fine. 1841 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1842 break; 1843 case ISD::SETUO: 1844 Invert = true; 1845 LLVM_FALLTHROUGH; 1846 case ISD::SETO: 1847 CondCode = AArch64CC::MI; 1848 CondCode2 = AArch64CC::GE; 1849 break; 1850 case ISD::SETUEQ: 1851 case ISD::SETULT: 1852 case ISD::SETULE: 1853 case ISD::SETUGT: 1854 case ISD::SETUGE: 1855 // All of the compare-mask comparisons are ordered, but we can switch 1856 // between the two by a double inversion. E.g. ULE == !OGT. 1857 Invert = true; 1858 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), 1859 CondCode, CondCode2); 1860 break; 1861 } 1862 } 1863 1864 static bool isLegalArithImmed(uint64_t C) { 1865 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 1866 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1867 LLVM_DEBUG(dbgs() << "Is imm " << C 1868 << " legal: " << (IsLegal ? "yes\n" : "no\n")); 1869 return IsLegal; 1870 } 1871 1872 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on 1873 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags 1874 // can be set differently by this operation. It comes down to whether 1875 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 1876 // everything is fine. If not then the optimization is wrong. Thus general 1877 // comparisons are only valid if op2 != 0. 1878 // 1879 // So, finally, the only LLVM-native comparisons that don't mention C and V 1880 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 1881 // the absence of information about op2. 1882 static bool isCMN(SDValue Op, ISD::CondCode CC) { 1883 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && 1884 (CC == ISD::SETEQ || CC == ISD::SETNE); 1885 } 1886 1887 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, 1888 SelectionDAG &DAG, SDValue Chain, 1889 bool IsSignaling) { 1890 EVT VT = LHS.getValueType(); 1891 assert(VT != MVT::f128); 1892 assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); 1893 unsigned Opcode = 1894 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; 1895 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); 1896 } 1897 1898 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1899 const SDLoc &dl, SelectionDAG &DAG) { 1900 EVT VT = LHS.getValueType(); 1901 const bool FullFP16 = 1902 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 1903 1904 if (VT.isFloatingPoint()) { 1905 assert(VT != MVT::f128); 1906 if (VT == MVT::f16 && !FullFP16) { 1907 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 1908 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 1909 VT = MVT::f32; 1910 } 1911 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1912 } 1913 1914 // The CMP instruction is just an alias for SUBS, and representing it as 1915 // SUBS means that it's possible to get CSE with subtract operations. 1916 // A later phase can perform the optimization of setting the destination 1917 // register to WZR/XZR if it ends up being unused. 
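  // (Illustrative note, added; not from the original comment:) e.g. if the DAG
  // also computes (sub w0, w1) as a value, modelling the compare as SUBS lets
  // CSE merge the two, so a single
  //   subs w2, w0, w1
  // both produces the difference and feeds the flag users.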
1918   unsigned Opcode = AArch64ISD::SUBS;
1919 
1920   if (isCMN(RHS, CC)) {
1921     // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1922     Opcode = AArch64ISD::ADDS;
1923     RHS = RHS.getOperand(1);
1924   } else if (isCMN(LHS, CC)) {
1925     // As we are looking for EQ/NE compares, the operands can be commuted; can
1926     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1927     Opcode = AArch64ISD::ADDS;
1928     LHS = LHS.getOperand(1);
1929   } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
1930     if (LHS.getOpcode() == ISD::AND) {
1931       // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1932       // (a.k.a. ANDS) except that the flags are only guaranteed to work for
1933       // one of the signed comparisons.
1934       const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
1935                                            DAG.getVTList(VT, MVT_CC),
1936                                            LHS.getOperand(0),
1937                                            LHS.getOperand(1));
1938       // Replace all users of (and X, Y) with the newly generated (ands X, Y).
1939       DAG.ReplaceAllUsesWith(LHS, ANDSNode);
1940       return ANDSNode.getValue(1);
1941     } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
1942       // Use the flag result of the existing ANDS.
1943       return LHS.getValue(1);
1944     }
1945   }
1946 
1947   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1948       .getValue(1);
1949 }
1950 
1951 /// \defgroup AArch64CCMP CMP;CCMP matching
1952 ///
1953 /// These functions deal with the formation of CMP;CCMP;... sequences.
1954 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1955 /// a comparison. They set the NZCV flags to a predefined value if their
1956 /// predicate is false. This allows expressing arbitrary conjunctions, for
1957 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1958 /// expressed as:
1959 ///   cmp A
1960 ///   ccmp B, inv(CB), CA
1961 ///   check for CB flags
1962 ///
1963 /// This naturally lets us implement chains of AND operations with SETCC
1964 /// operands. And we can even implement some other situations by transforming
1965 /// them:
1966 /// - We can implement (NEG SETCC), i.e. negating a single comparison, by
1967 ///   negating the flags used in a CCMP/FCCMP operation.
1968 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1969 ///   by negating the flags we test for afterwards; i.e.
1970 ///   NEG (CMP CCMP CCMP ...) can be implemented.
1971 /// - Note that we can only ever negate all previously processed results.
1972 ///   What we cannot implement by flipping the flags to test is a negation
1973 ///   of two sub-trees (because the negation affects all sub-trees emitted so
1974 ///   far, so the 2nd sub-tree we emit would also affect the first).
1975 /// With those tools we can implement some OR operations:
1976 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1977 ///   NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1978 /// - After transforming OR to NEG/AND combinations we may be able to use the
1979 ///   NEG elimination rules from earlier to implement the whole thing as a
1980 ///   CCMP/FCCMP chain.
1981 ///
1982 /// As a complete example:
1983 ///   or (or (setCA (cmp A)) (setCB (cmp B)))
1984 ///      (and (setCC (cmp C)) (setCD (cmp D)))
1985 /// can be reassociated to:
1986 ///   or (and (setCC (cmp C)) (setCD (cmp D)))
1987 ///      (or (setCA (cmp A)) (setCB (cmp B)))
1988 /// can be transformed to:
1989 ///   not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1990 ///            (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
1991 /// which can be implemented as:
1992 ///   cmp C
1993 ///   ccmp D, inv(CD), CC
1994 ///   ccmp A, CA, inv(CD)
1995 ///   ccmp B, CB, inv(CA)
1996 ///   check for CB flags
1997 ///
1998 /// A counterexample is "or (and A B) (and C D)", which translates to
1999 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
2000 /// can only implement one of the inner (not) operations, but not both!
2001 /// @{
2002 
2003 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
2004 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2005                                          ISD::CondCode CC, SDValue CCOp,
2006                                          AArch64CC::CondCode Predicate,
2007                                          AArch64CC::CondCode OutCC,
2008                                          const SDLoc &DL, SelectionDAG &DAG) {
2009   unsigned Opcode = 0;
2010   const bool FullFP16 =
2011       static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2012 
2013   if (LHS.getValueType().isFloatingPoint()) {
2014     assert(LHS.getValueType() != MVT::f128);
2015     if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2016       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2017       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2018     }
2019     Opcode = AArch64ISD::FCCMP;
2020   } else if (RHS.getOpcode() == ISD::SUB) {
2021     SDValue SubOp0 = RHS.getOperand(0);
2022     if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2023       // See emitComparison() on why we can only do this for SETEQ and SETNE.
2024       Opcode = AArch64ISD::CCMN;
2025       RHS = RHS.getOperand(1);
2026     }
2027   }
2028   if (Opcode == 0)
2029     Opcode = AArch64ISD::CCMP;
2030 
2031   SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2032   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2033   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2034   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2035   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2036 }
2037 
2038 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2039 /// expressed as a conjunction. See \ref AArch64CCMP.
2040 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
2041 ///                     changing the conditions on the SETCC tests.
2042 ///                     (this means we can call emitConjunctionRec() with
2043 ///                      Negate==true on this sub-tree)
2044 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
2045 ///                     cannot do the negation naturally. We are required to
2046 ///                     emit the subtree first in this case.
2047 /// \param WillNegate   Is true if we are called when the result of this
2048 ///                     subexpression must be negated. This happens when the
2049 ///                     outer expression is an OR. We can use this fact to know
2050 ///                     that we have a double negation (or (or ...) ...) that
2051 ///                     can be implemented for free.
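/// (Illustrative example, added; not part of the original comment:) for a pure
/// conjunction such as
///   (and (setcc a, b, setge) (setcc c, d, setne))
/// every SETCC leaf reports CanNegate=true and MustBeFirst=false, so the tree
/// is accepted and can later be emitted roughly as
///   cmp  c, d
///   ccmp a, b, #<nzcv for lt>, ne
/// with the final result tested via the GE flags.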
2052 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2053                                bool &MustBeFirst, bool WillNegate,
2054                                unsigned Depth = 0) {
2055   if (!Val.hasOneUse())
2056     return false;
2057   unsigned Opcode = Val->getOpcode();
2058   if (Opcode == ISD::SETCC) {
2059     if (Val->getOperand(0).getValueType() == MVT::f128)
2060       return false;
2061     CanNegate = true;
2062     MustBeFirst = false;
2063     return true;
2064   }
2065   // Protect against exponential runtime and stack overflow.
2066   if (Depth > 6)
2067     return false;
2068   if (Opcode == ISD::AND || Opcode == ISD::OR) {
2069     bool IsOR = Opcode == ISD::OR;
2070     SDValue O0 = Val->getOperand(0);
2071     SDValue O1 = Val->getOperand(1);
2072     bool CanNegateL;
2073     bool MustBeFirstL;
2074     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2075       return false;
2076     bool CanNegateR;
2077     bool MustBeFirstR;
2078     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2079       return false;
2080 
2081     if (MustBeFirstL && MustBeFirstR)
2082       return false;
2083 
2084     if (IsOR) {
2085       // For an OR expression we need to be able to naturally negate at least
2086       // one side or we cannot do the transformation at all.
2087       if (!CanNegateL && !CanNegateR)
2088         return false;
2089       // If the result of the OR will be negated and we can naturally negate
2090       // the leaves, then this sub-tree as a whole negates naturally.
2091       CanNegate = WillNegate && CanNegateL && CanNegateR;
2092       // If we cannot naturally negate the whole sub-tree, then this must be
2093       // emitted first.
2094       MustBeFirst = !CanNegate;
2095     } else {
2096       assert(Opcode == ISD::AND && "Must be OR or AND");
2097       // We cannot naturally negate an AND operation.
2098       CanNegate = false;
2099       MustBeFirst = MustBeFirstL || MustBeFirstR;
2100     }
2101     return true;
2102   }
2103   return false;
2104 }
2105 
2106 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2107 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
2108 /// Tries to transform the given i1-producing node @p Val into a series of
2109 /// compare and conditional compare operations. @returns an NZCV flags producing
2110 /// node and sets @p OutCC to the flags that should be tested, or returns
2111 /// SDValue() if the transformation was not possible.
2112 /// \p Negate is true if we want this sub-tree to be negated just by changing
2113 /// SETCC conditions.
2114 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2115     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2116     AArch64CC::CondCode Predicate) {
2117   // We're at a tree leaf, produce a conditional comparison operation.
2118   unsigned Opcode = Val->getOpcode();
2119   if (Opcode == ISD::SETCC) {
2120     SDValue LHS = Val->getOperand(0);
2121     SDValue RHS = Val->getOperand(1);
2122     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2123     bool isInteger = LHS.getValueType().isInteger();
2124     if (Negate)
2125       CC = getSetCCInverse(CC, LHS.getValueType());
2126     SDLoc DL(Val);
2127     // Determine OutCC and handle FP special case.
2128     if (isInteger) {
2129       OutCC = changeIntCCToAArch64CC(CC);
2130     } else {
2131       assert(LHS.getValueType().isFloatingPoint());
2132       AArch64CC::CondCode ExtraCC;
2133       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2134       // Some floating point conditions can't be tested with a single condition
2135       // code. Construct an additional comparison in this case.
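      // (Added note, illustrative:) SETONE is one such case: it maps to the
      // pair (VC, NE) above, so the NE half (ExtraCC) gets its own compare or
      // conditional compare chained in first, and the ordered VC half is what
      // the caller finally tests.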
2136 if (ExtraCC != AArch64CC::AL) { 2137 SDValue ExtraCmp; 2138 if (!CCOp.getNode()) 2139 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 2140 else 2141 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 2142 ExtraCC, DL, DAG); 2143 CCOp = ExtraCmp; 2144 Predicate = ExtraCC; 2145 } 2146 } 2147 2148 // Produce a normal comparison if we are first in the chain 2149 if (!CCOp) 2150 return emitComparison(LHS, RHS, CC, DL, DAG); 2151 // Otherwise produce a ccmp. 2152 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 2153 DAG); 2154 } 2155 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); 2156 2157 bool IsOR = Opcode == ISD::OR; 2158 2159 SDValue LHS = Val->getOperand(0); 2160 bool CanNegateL; 2161 bool MustBeFirstL; 2162 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); 2163 assert(ValidL && "Valid conjunction/disjunction tree"); 2164 (void)ValidL; 2165 2166 SDValue RHS = Val->getOperand(1); 2167 bool CanNegateR; 2168 bool MustBeFirstR; 2169 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); 2170 assert(ValidR && "Valid conjunction/disjunction tree"); 2171 (void)ValidR; 2172 2173 // Swap sub-tree that must come first to the right side. 2174 if (MustBeFirstL) { 2175 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 2176 std::swap(LHS, RHS); 2177 std::swap(CanNegateL, CanNegateR); 2178 std::swap(MustBeFirstL, MustBeFirstR); 2179 } 2180 2181 bool NegateR; 2182 bool NegateAfterR; 2183 bool NegateL; 2184 bool NegateAfterAll; 2185 if (Opcode == ISD::OR) { 2186 // Swap the sub-tree that we can negate naturally to the left. 2187 if (!CanNegateL) { 2188 assert(CanNegateR && "at least one side must be negatable"); 2189 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 2190 assert(!Negate); 2191 std::swap(LHS, RHS); 2192 NegateR = false; 2193 NegateAfterR = true; 2194 } else { 2195 // Negate the left sub-tree if possible, otherwise negate the result. 2196 NegateR = CanNegateR; 2197 NegateAfterR = !CanNegateR; 2198 } 2199 NegateL = true; 2200 NegateAfterAll = !Negate; 2201 } else { 2202 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); 2203 assert(!Negate && "Valid conjunction/disjunction tree"); 2204 2205 NegateL = false; 2206 NegateR = false; 2207 NegateAfterR = false; 2208 NegateAfterAll = false; 2209 } 2210 2211 // Emit sub-trees. 2212 AArch64CC::CondCode RHSCC; 2213 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); 2214 if (NegateAfterR) 2215 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 2216 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); 2217 if (NegateAfterAll) 2218 OutCC = AArch64CC::getInvertedCondCode(OutCC); 2219 return CmpL; 2220 } 2221 2222 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). 2223 /// In some cases this is even possible with OR operations in the expression. 2224 /// See \ref AArch64CCMP. 2225 /// \see emitConjunctionRec(). 2226 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, 2227 AArch64CC::CondCode &OutCC) { 2228 bool DummyCanNegate; 2229 bool DummyMustBeFirst; 2230 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) 2231 return SDValue(); 2232 2233 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); 2234 } 2235 2236 /// @} 2237 2238 /// Returns how profitable it is to fold a comparison's operand's shift and/or 2239 /// extension operations. 
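/// (Illustrative, added, based on the checks below:) an operand such as
///   (shl (and x, 0xff), 2)
/// scores 2 since it can fold into the compare as an extended register with a
/// small shift, a bare (and x, 0xffff) or an in-range plain shift scores 1,
/// and anything else (or a multi-use operand) scores 0.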
2240 static unsigned getCmpOperandFoldingProfit(SDValue Op) { 2241 auto isSupportedExtend = [&](SDValue V) { 2242 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) 2243 return true; 2244 2245 if (V.getOpcode() == ISD::AND) 2246 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { 2247 uint64_t Mask = MaskCst->getZExtValue(); 2248 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); 2249 } 2250 2251 return false; 2252 }; 2253 2254 if (!Op.hasOneUse()) 2255 return 0; 2256 2257 if (isSupportedExtend(Op)) 2258 return 1; 2259 2260 unsigned Opc = Op.getOpcode(); 2261 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 2262 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 2263 uint64_t Shift = ShiftCst->getZExtValue(); 2264 if (isSupportedExtend(Op.getOperand(0))) 2265 return (Shift <= 4) ? 2 : 1; 2266 EVT VT = Op.getValueType(); 2267 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) 2268 return 1; 2269 } 2270 2271 return 0; 2272 } 2273 2274 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 2275 SDValue &AArch64cc, SelectionDAG &DAG, 2276 const SDLoc &dl) { 2277 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 2278 EVT VT = RHS.getValueType(); 2279 uint64_t C = RHSC->getZExtValue(); 2280 if (!isLegalArithImmed(C)) { 2281 // Constant does not fit, try adjusting it by one? 2282 switch (CC) { 2283 default: 2284 break; 2285 case ISD::SETLT: 2286 case ISD::SETGE: 2287 if ((VT == MVT::i32 && C != 0x80000000 && 2288 isLegalArithImmed((uint32_t)(C - 1))) || 2289 (VT == MVT::i64 && C != 0x80000000ULL && 2290 isLegalArithImmed(C - 1ULL))) { 2291 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 2292 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 2293 RHS = DAG.getConstant(C, dl, VT); 2294 } 2295 break; 2296 case ISD::SETULT: 2297 case ISD::SETUGE: 2298 if ((VT == MVT::i32 && C != 0 && 2299 isLegalArithImmed((uint32_t)(C - 1))) || 2300 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 2301 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 2302 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 2303 RHS = DAG.getConstant(C, dl, VT); 2304 } 2305 break; 2306 case ISD::SETLE: 2307 case ISD::SETGT: 2308 if ((VT == MVT::i32 && C != INT32_MAX && 2309 isLegalArithImmed((uint32_t)(C + 1))) || 2310 (VT == MVT::i64 && C != INT64_MAX && 2311 isLegalArithImmed(C + 1ULL))) { 2312 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 2313 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 2314 RHS = DAG.getConstant(C, dl, VT); 2315 } 2316 break; 2317 case ISD::SETULE: 2318 case ISD::SETUGT: 2319 if ((VT == MVT::i32 && C != UINT32_MAX && 2320 isLegalArithImmed((uint32_t)(C + 1))) || 2321 (VT == MVT::i64 && C != UINT64_MAX && 2322 isLegalArithImmed(C + 1ULL))) { 2323 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 2324 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 2325 RHS = DAG.getConstant(C, dl, VT); 2326 } 2327 break; 2328 } 2329 } 2330 } 2331 2332 // Comparisons are canonicalized so that the RHS operand is simpler than the 2333 // LHS one, the extreme case being when RHS is an immediate. However, AArch64 2334 // can fold some shift+extend operations on the RHS operand, so swap the 2335 // operands if that can be done. 
2336 // 2337 // For example: 2338 // lsl w13, w11, #1 2339 // cmp w13, w12 2340 // can be turned into: 2341 // cmp w12, w11, lsl #1 2342 if (!isa<ConstantSDNode>(RHS) || 2343 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { 2344 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; 2345 2346 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { 2347 std::swap(LHS, RHS); 2348 CC = ISD::getSetCCSwappedOperands(CC); 2349 } 2350 } 2351 2352 SDValue Cmp; 2353 AArch64CC::CondCode AArch64CC; 2354 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 2355 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 2356 2357 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 2358 // For the i8 operand, the largest immediate is 255, so this can be easily 2359 // encoded in the compare instruction. For the i16 operand, however, the 2360 // largest immediate cannot be encoded in the compare. 2361 // Therefore, use a sign extending load and cmn to avoid materializing the 2362 // -1 constant. For example, 2363 // movz w1, #65535 2364 // ldrh w0, [x0, #0] 2365 // cmp w0, w1 2366 // > 2367 // ldrsh w0, [x0, #0] 2368 // cmn w0, #1 2369 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 2370 // if and only if (sext LHS) == (sext RHS). The checks are in place to 2371 // ensure both the LHS and RHS are truly zero extended and to make sure the 2372 // transformation is profitable. 2373 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 2374 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 2375 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 2376 LHS.getNode()->hasNUsesOfValue(1, 0)) { 2377 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 2378 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 2379 SDValue SExt = 2380 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 2381 DAG.getValueType(MVT::i16)); 2382 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 2383 RHS.getValueType()), 2384 CC, dl, DAG); 2385 AArch64CC = changeIntCCToAArch64CC(CC); 2386 } 2387 } 2388 2389 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 2390 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { 2391 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 2392 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 2393 } 2394 } 2395 } 2396 2397 if (!Cmp) { 2398 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 2399 AArch64CC = changeIntCCToAArch64CC(CC); 2400 } 2401 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 2402 return Cmp; 2403 } 2404 2405 static std::pair<SDValue, SDValue> 2406 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 2407 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 2408 "Unsupported value type"); 2409 SDValue Value, Overflow; 2410 SDLoc DL(Op); 2411 SDValue LHS = Op.getOperand(0); 2412 SDValue RHS = Op.getOperand(1); 2413 unsigned Opc = 0; 2414 switch (Op.getOpcode()) { 2415 default: 2416 llvm_unreachable("Unknown overflow instruction!"); 2417 case ISD::SADDO: 2418 Opc = AArch64ISD::ADDS; 2419 CC = AArch64CC::VS; 2420 break; 2421 case ISD::UADDO: 2422 Opc = AArch64ISD::ADDS; 2423 CC = AArch64CC::HS; 2424 break; 2425 case ISD::SSUBO: 2426 Opc = AArch64ISD::SUBS; 2427 CC = AArch64CC::VS; 2428 break; 2429 case ISD::USUBO: 2430 Opc = AArch64ISD::SUBS; 2431 CC = AArch64CC::LO; 2432 break; 2433 // Multiply needs a little bit extra work. 
2434 case ISD::SMULO: 2435 case ISD::UMULO: { 2436 CC = AArch64CC::NE; 2437 bool IsSigned = Op.getOpcode() == ISD::SMULO; 2438 if (Op.getValueType() == MVT::i32) { 2439 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2440 // For a 32 bit multiply with overflow check we want the instruction 2441 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 2442 // need to generate the following pattern: 2443 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 2444 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 2445 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 2446 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 2447 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 2448 DAG.getConstant(0, DL, MVT::i64)); 2449 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 2450 // operation. We need to clear out the upper 32 bits, because we used a 2451 // widening multiply that wrote all 64 bits. In the end this should be a 2452 // noop. 2453 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 2454 if (IsSigned) { 2455 // The signed overflow check requires more than just a simple check for 2456 // any bit set in the upper 32 bits of the result. These bits could be 2457 // just the sign bits of a negative number. To perform the overflow 2458 // check we have to arithmetic shift right the 32nd bit of the result by 2459 // 31 bits. Then we compare the result to the upper 32 bits. 2460 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 2461 DAG.getConstant(32, DL, MVT::i64)); 2462 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 2463 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 2464 DAG.getConstant(31, DL, MVT::i64)); 2465 // It is important that LowerBits is last, otherwise the arithmetic 2466 // shift will not be folded into the compare (SUBS). 2467 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 2468 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 2469 .getValue(1); 2470 } else { 2471 // The overflow check for unsigned multiply is easy. We only need to 2472 // check if any of the upper 32 bits are set. This can be done with a 2473 // CMP (shifted register). For that we need to generate the following 2474 // pattern: 2475 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 2476 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 2477 DAG.getConstant(32, DL, MVT::i64)); 2478 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2479 Overflow = 2480 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 2481 DAG.getConstant(0, DL, MVT::i64), 2482 UpperBits).getValue(1); 2483 } 2484 break; 2485 } 2486 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 2487 // For the 64 bit multiply 2488 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 2489 if (IsSigned) { 2490 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 2491 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 2492 DAG.getConstant(63, DL, MVT::i64)); 2493 // It is important that LowerBits is last, otherwise the arithmetic 2494 // shift will not be folded into the compare (SUBS). 
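      // (Illustrative, added:) this typically selects to something like
      //   mul   x8, x0, x1
      //   smulh x9, x0, x1
      //   cmp   x9, x8, asr #63
      // with overflow indicated by NE.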
2495 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2496 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 2497 .getValue(1); 2498 } else { 2499 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 2500 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2501 Overflow = 2502 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 2503 DAG.getConstant(0, DL, MVT::i64), 2504 UpperBits).getValue(1); 2505 } 2506 break; 2507 } 2508 } // switch (...) 2509 2510 if (Opc) { 2511 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 2512 2513 // Emit the AArch64 operation with overflow check. 2514 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 2515 Overflow = Value.getValue(1); 2516 } 2517 return std::make_pair(Value, Overflow); 2518 } 2519 2520 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 2521 RTLIB::Libcall Call) const { 2522 bool IsStrict = Op->isStrictFPOpcode(); 2523 unsigned Offset = IsStrict ? 1 : 0; 2524 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 2525 SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end()); 2526 MakeLibCallOptions CallOptions; 2527 SDValue Result; 2528 SDLoc dl(Op); 2529 std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, 2530 CallOptions, dl, Chain); 2531 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; 2532 } 2533 2534 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 2535 SDValue Sel = Op.getOperand(0); 2536 SDValue Other = Op.getOperand(1); 2537 SDLoc dl(Sel); 2538 2539 // If the operand is an overflow checking operation, invert the condition 2540 // code and kill the Not operation. I.e., transform: 2541 // (xor (overflow_op_bool, 1)) 2542 // --> 2543 // (csel 1, 0, invert(cc), overflow_op_bool) 2544 // ... which later gets transformed to just a cset instruction with an 2545 // inverted condition code, rather than a cset + eor sequence. 2546 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { 2547 // Only lower legal XALUO ops. 2548 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) 2549 return SDValue(); 2550 2551 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 2552 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 2553 AArch64CC::CondCode CC; 2554 SDValue Value, Overflow; 2555 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); 2556 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 2557 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, 2558 CCVal, Overflow); 2559 } 2560 // If neither operand is a SELECT_CC, give up. 2561 if (Sel.getOpcode() != ISD::SELECT_CC) 2562 std::swap(Sel, Other); 2563 if (Sel.getOpcode() != ISD::SELECT_CC) 2564 return Op; 2565 2566 // The folding we want to perform is: 2567 // (xor x, (select_cc a, b, cc, 0, -1) ) 2568 // --> 2569 // (csel x, (xor x, -1), cc ...) 2570 // 2571 // The latter will get matched to a CSINV instruction. 2572 2573 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 2574 SDValue LHS = Sel.getOperand(0); 2575 SDValue RHS = Sel.getOperand(1); 2576 SDValue TVal = Sel.getOperand(2); 2577 SDValue FVal = Sel.getOperand(3); 2578 2579 // FIXME: This could be generalized to non-integer comparisons. 2580 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 2581 return Op; 2582 2583 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 2584 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 2585 2586 // The values aren't constants, this isn't the pattern we're looking for. 
2587 if (!CFVal || !CTVal) 2588 return Op; 2589 2590 // We can commute the SELECT_CC by inverting the condition. This 2591 // might be needed to make this fit into a CSINV pattern. 2592 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 2593 std::swap(TVal, FVal); 2594 std::swap(CTVal, CFVal); 2595 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 2596 } 2597 2598 // If the constants line up, perform the transform! 2599 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 2600 SDValue CCVal; 2601 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 2602 2603 FVal = Other; 2604 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 2605 DAG.getConstant(-1ULL, dl, Other.getValueType())); 2606 2607 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 2608 CCVal, Cmp); 2609 } 2610 2611 return Op; 2612 } 2613 2614 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 2615 EVT VT = Op.getValueType(); 2616 2617 // Let legalize expand this if it isn't a legal type yet. 2618 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 2619 return SDValue(); 2620 2621 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 2622 2623 unsigned Opc; 2624 bool ExtraOp = false; 2625 switch (Op.getOpcode()) { 2626 default: 2627 llvm_unreachable("Invalid code"); 2628 case ISD::ADDC: 2629 Opc = AArch64ISD::ADDS; 2630 break; 2631 case ISD::SUBC: 2632 Opc = AArch64ISD::SUBS; 2633 break; 2634 case ISD::ADDE: 2635 Opc = AArch64ISD::ADCS; 2636 ExtraOp = true; 2637 break; 2638 case ISD::SUBE: 2639 Opc = AArch64ISD::SBCS; 2640 ExtraOp = true; 2641 break; 2642 } 2643 2644 if (!ExtraOp) 2645 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 2646 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 2647 Op.getOperand(2)); 2648 } 2649 2650 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 2651 // Let legalize expand this if it isn't a legal type yet. 2652 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 2653 return SDValue(); 2654 2655 SDLoc dl(Op); 2656 AArch64CC::CondCode CC; 2657 // The actual operation that sets the overflow or carry flag. 2658 SDValue Value, Overflow; 2659 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 2660 2661 // We use 0 and 1 as false and true values. 2662 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 2663 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 2664 2665 // We use an inverted condition, because the conditional select is inverted 2666 // too. This will allow it to be selected to a single instruction: 2667 // CSINC Wd, WZR, WZR, invert(cond). 2668 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 2669 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 2670 CCVal, Overflow); 2671 2672 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 2673 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 2674 } 2675 2676 // Prefetch operands are: 2677 // 1: Address to prefetch 2678 // 2: bool isWrite 2679 // 3: int locality (0 = no locality ... 
3 = extreme locality) 2680 // 4: bool isDataCache 2681 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 2682 SDLoc DL(Op); 2683 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 2684 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 2685 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 2686 2687 bool IsStream = !Locality; 2688 // When the locality number is set 2689 if (Locality) { 2690 // The front-end should have filtered out the out-of-range values 2691 assert(Locality <= 3 && "Prefetch locality out-of-range"); 2692 // The locality degree is the opposite of the cache speed. 2693 // Put the number the other way around. 2694 // The encoding starts at 0 for level 1 2695 Locality = 3 - Locality; 2696 } 2697 2698 // built the mask value encoding the expected behavior. 2699 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 2700 (!IsData << 3) | // IsDataCache bit 2701 (Locality << 1) | // Cache level bits 2702 (unsigned)IsStream; // Stream bit 2703 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 2704 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 2705 } 2706 2707 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 2708 SelectionDAG &DAG) const { 2709 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 2710 2711 RTLIB::Libcall LC; 2712 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 2713 2714 return LowerF128Call(Op, DAG, LC); 2715 } 2716 2717 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 2718 SelectionDAG &DAG) const { 2719 bool IsStrict = Op->isStrictFPOpcode(); 2720 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 2721 EVT SrcVT = SrcVal.getValueType(); 2722 2723 if (SrcVT != MVT::f128) { 2724 // Expand cases where the input is a vector bigger than NEON. 2725 if (useSVEForFixedLengthVectorVT(SrcVT)) 2726 return SDValue(); 2727 2728 // It's legal except when f128 is involved 2729 return Op; 2730 } 2731 2732 RTLIB::Libcall LC; 2733 LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); 2734 2735 // FP_ROUND node has a second operand indicating whether it is known to be 2736 // precise. That doesn't take part in the LibCall so we can't directly use 2737 // LowerF128Call. 2738 MakeLibCallOptions CallOptions; 2739 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 2740 SDValue Result; 2741 SDLoc dl(Op); 2742 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 2743 CallOptions, dl, Chain); 2744 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; 2745 } 2746 2747 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, 2748 SelectionDAG &DAG) const { 2749 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 2750 // Any additional optimization in this function should be recorded 2751 // in the cost tables. 2752 EVT InVT = Op.getOperand(0).getValueType(); 2753 EVT VT = Op.getValueType(); 2754 unsigned NumElts = InVT.getVectorNumElements(); 2755 2756 // f16 conversions are promoted to f32 when full fp16 is not supported. 
2757 if (InVT.getVectorElementType() == MVT::f16 && 2758 !Subtarget->hasFullFP16()) { 2759 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 2760 SDLoc dl(Op); 2761 return DAG.getNode( 2762 Op.getOpcode(), dl, Op.getValueType(), 2763 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 2764 } 2765 2766 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 2767 SDLoc dl(Op); 2768 SDValue Cv = 2769 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 2770 Op.getOperand(0)); 2771 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 2772 } 2773 2774 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2775 SDLoc dl(Op); 2776 MVT ExtVT = 2777 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 2778 VT.getVectorNumElements()); 2779 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 2780 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 2781 } 2782 2783 // Type changing conversions are illegal. 2784 return Op; 2785 } 2786 2787 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 2788 SelectionDAG &DAG) const { 2789 bool IsStrict = Op->isStrictFPOpcode(); 2790 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 2791 2792 if (SrcVal.getValueType().isVector()) 2793 return LowerVectorFP_TO_INT(Op, DAG); 2794 2795 // f16 conversions are promoted to f32 when full fp16 is not supported. 2796 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { 2797 assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); 2798 SDLoc dl(Op); 2799 return DAG.getNode( 2800 Op.getOpcode(), dl, Op.getValueType(), 2801 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); 2802 } 2803 2804 if (SrcVal.getValueType() != MVT::f128) { 2805 // It's legal except when f128 is involved 2806 return Op; 2807 } 2808 2809 RTLIB::Libcall LC; 2810 if (Op.getOpcode() == ISD::FP_TO_SINT || 2811 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 2812 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); 2813 else 2814 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); 2815 2816 return LowerF128Call(Op, DAG, LC); 2817 } 2818 2819 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 2820 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 2821 // Any additional optimization in this function should be recorded 2822 // in the cost tables. 2823 EVT VT = Op.getValueType(); 2824 SDLoc dl(Op); 2825 SDValue In = Op.getOperand(0); 2826 EVT InVT = In.getValueType(); 2827 2828 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 2829 MVT CastVT = 2830 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 2831 InVT.getVectorNumElements()); 2832 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 2833 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 2834 } 2835 2836 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2837 unsigned CastOpc = 2838 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2839 EVT CastVT = VT.changeVectorElementTypeToInteger(); 2840 In = DAG.getNode(CastOpc, dl, CastVT, In); 2841 return DAG.getNode(Op.getOpcode(), dl, VT, In); 2842 } 2843 2844 return Op; 2845 } 2846 2847 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 2848 SelectionDAG &DAG) const { 2849 if (Op.getValueType().isVector()) 2850 return LowerVectorINT_TO_FP(Op, DAG); 2851 2852 bool IsStrict = Op->isStrictFPOpcode(); 2853 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 2854 2855 // f16 conversions are promoted to f32 when full fp16 is not supported. 
2856 if (Op.getValueType() == MVT::f16 && 2857 !Subtarget->hasFullFP16()) { 2858 assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); 2859 SDLoc dl(Op); 2860 return DAG.getNode( 2861 ISD::FP_ROUND, dl, MVT::f16, 2862 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), 2863 DAG.getIntPtrConstant(0, dl)); 2864 } 2865 2866 // i128 conversions are libcalls. 2867 if (SrcVal.getValueType() == MVT::i128) 2868 return SDValue(); 2869 2870 // Other conversions are legal, unless it's to the completely software-based 2871 // fp128. 2872 if (Op.getValueType() != MVT::f128) 2873 return Op; 2874 2875 RTLIB::Libcall LC; 2876 if (Op.getOpcode() == ISD::SINT_TO_FP || 2877 Op.getOpcode() == ISD::STRICT_SINT_TO_FP) 2878 LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); 2879 else 2880 LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); 2881 2882 return LowerF128Call(Op, DAG, LC); 2883 } 2884 2885 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 2886 SelectionDAG &DAG) const { 2887 // For iOS, we want to call an alternative entry point: __sincos_stret, 2888 // which returns the values in two S / D registers. 2889 SDLoc dl(Op); 2890 SDValue Arg = Op.getOperand(0); 2891 EVT ArgVT = Arg.getValueType(); 2892 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 2893 2894 ArgListTy Args; 2895 ArgListEntry Entry; 2896 2897 Entry.Node = Arg; 2898 Entry.Ty = ArgTy; 2899 Entry.IsSExt = false; 2900 Entry.IsZExt = false; 2901 Args.push_back(Entry); 2902 2903 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 2904 : RTLIB::SINCOS_STRET_F32; 2905 const char *LibcallName = getLibcallName(LC); 2906 SDValue Callee = 2907 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 2908 2909 StructType *RetTy = StructType::get(ArgTy, ArgTy); 2910 TargetLowering::CallLoweringInfo CLI(DAG); 2911 CLI.setDebugLoc(dl) 2912 .setChain(DAG.getEntryNode()) 2913 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 2914 2915 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2916 return CallResult.first; 2917 } 2918 2919 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2920 EVT OpVT = Op.getValueType(); 2921 if (OpVT != MVT::f16 && OpVT != MVT::bf16) 2922 return SDValue(); 2923 2924 assert(Op.getOperand(0).getValueType() == MVT::i16); 2925 SDLoc DL(Op); 2926 2927 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2928 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2929 return SDValue( 2930 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, 2931 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2932 0); 2933 } 2934 2935 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2936 if (OrigVT.getSizeInBits() >= 64) 2937 return OrigVT; 2938 2939 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2940 2941 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2942 switch (OrigSimpleTy) { 2943 default: llvm_unreachable("Unexpected Vector Type"); 2944 case MVT::v2i8: 2945 case MVT::v2i16: 2946 return MVT::v2i32; 2947 case MVT::v4i8: 2948 return MVT::v4i16; 2949 } 2950 } 2951 2952 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2953 const EVT &OrigTy, 2954 const EVT &ExtTy, 2955 unsigned ExtOpcode) { 2956 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2957 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2958 // 64-bits we need to insert a new extension so that it will be 64-bits. 
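  // (Illustrative, added:) e.g. a v4i8 operand that was extended to v4i32 is
  // re-extended here to v4i16, the 64-bit type [SU]MULL can actually consume.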
2959 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2960 if (OrigTy.getSizeInBits() >= 64) 2961 return N; 2962 2963 // Must extend size to at least 64 bits to be used as an operand for VMULL. 2964 EVT NewVT = getExtensionTo64Bits(OrigTy); 2965 2966 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2967 } 2968 2969 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2970 bool isSigned) { 2971 EVT VT = N->getValueType(0); 2972 2973 if (N->getOpcode() != ISD::BUILD_VECTOR) 2974 return false; 2975 2976 for (const SDValue &Elt : N->op_values()) { 2977 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2978 unsigned EltSize = VT.getScalarSizeInBits(); 2979 unsigned HalfSize = EltSize / 2; 2980 if (isSigned) { 2981 if (!isIntN(HalfSize, C->getSExtValue())) 2982 return false; 2983 } else { 2984 if (!isUIntN(HalfSize, C->getZExtValue())) 2985 return false; 2986 } 2987 continue; 2988 } 2989 return false; 2990 } 2991 2992 return true; 2993 } 2994 2995 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2996 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2997 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2998 N->getOperand(0)->getValueType(0), 2999 N->getValueType(0), 3000 N->getOpcode()); 3001 3002 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 3003 EVT VT = N->getValueType(0); 3004 SDLoc dl(N); 3005 unsigned EltSize = VT.getScalarSizeInBits() / 2; 3006 unsigned NumElts = VT.getVectorNumElements(); 3007 MVT TruncVT = MVT::getIntegerVT(EltSize); 3008 SmallVector<SDValue, 8> Ops; 3009 for (unsigned i = 0; i != NumElts; ++i) { 3010 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 3011 const APInt &CInt = C->getAPIntValue(); 3012 // Element types smaller than 32 bits are not legal, so use i32 elements. 3013 // The values are implicitly truncated so sext vs. zext doesn't matter. 3014 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 3015 } 3016 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 3017 } 3018 3019 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 3020 return N->getOpcode() == ISD::SIGN_EXTEND || 3021 isExtendedBUILD_VECTOR(N, DAG, true); 3022 } 3023 3024 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 3025 return N->getOpcode() == ISD::ZERO_EXTEND || 3026 isExtendedBUILD_VECTOR(N, DAG, false); 3027 } 3028 3029 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 3030 unsigned Opcode = N->getOpcode(); 3031 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 3032 SDNode *N0 = N->getOperand(0).getNode(); 3033 SDNode *N1 = N->getOperand(1).getNode(); 3034 return N0->hasOneUse() && N1->hasOneUse() && 3035 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 3036 } 3037 return false; 3038 } 3039 3040 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 3041 unsigned Opcode = N->getOpcode(); 3042 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 3043 SDNode *N0 = N->getOperand(0).getNode(); 3044 SDNode *N1 = N->getOperand(1).getNode(); 3045 return N0->hasOneUse() && N1->hasOneUse() && 3046 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 3047 } 3048 return false; 3049 } 3050 3051 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 3052 SelectionDAG &DAG) const { 3053 // The rounding mode is in bits 23:22 of the FPSCR. 
3054 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 3055 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 3056 // so that the shift + and get folded into a bitfield extract. 3057 SDLoc dl(Op); 3058 3059 SDValue Chain = Op.getOperand(0); 3060 SDValue FPCR_64 = DAG.getNode( 3061 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, 3062 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); 3063 Chain = FPCR_64.getValue(1); 3064 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); 3065 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, 3066 DAG.getConstant(1U << 22, dl, MVT::i32)); 3067 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 3068 DAG.getConstant(22, dl, MVT::i32)); 3069 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 3070 DAG.getConstant(3, dl, MVT::i32)); 3071 return DAG.getMergeValues({AND, Chain}, dl); 3072 } 3073 3074 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 3075 // Multiplications are only custom-lowered for 128-bit vectors so that 3076 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 3077 EVT VT = Op.getValueType(); 3078 assert(VT.is128BitVector() && VT.isInteger() && 3079 "unexpected type for custom-lowering ISD::MUL"); 3080 SDNode *N0 = Op.getOperand(0).getNode(); 3081 SDNode *N1 = Op.getOperand(1).getNode(); 3082 unsigned NewOpc = 0; 3083 bool isMLA = false; 3084 bool isN0SExt = isSignExtended(N0, DAG); 3085 bool isN1SExt = isSignExtended(N1, DAG); 3086 if (isN0SExt && isN1SExt) 3087 NewOpc = AArch64ISD::SMULL; 3088 else { 3089 bool isN0ZExt = isZeroExtended(N0, DAG); 3090 bool isN1ZExt = isZeroExtended(N1, DAG); 3091 if (isN0ZExt && isN1ZExt) 3092 NewOpc = AArch64ISD::UMULL; 3093 else if (isN1SExt || isN1ZExt) { 3094 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 3095 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 3096 if (isN1SExt && isAddSubSExt(N0, DAG)) { 3097 NewOpc = AArch64ISD::SMULL; 3098 isMLA = true; 3099 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 3100 NewOpc = AArch64ISD::UMULL; 3101 isMLA = true; 3102 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 3103 std::swap(N0, N1); 3104 NewOpc = AArch64ISD::UMULL; 3105 isMLA = true; 3106 } 3107 } 3108 3109 if (!NewOpc) { 3110 if (VT == MVT::v2i64) 3111 // Fall through to expand this. It is not legal. 3112 return SDValue(); 3113 else 3114 // Other vector multiplications are legal. 3115 return Op; 3116 } 3117 } 3118 3119 // Legalize to a S/UMULL instruction 3120 SDLoc DL(Op); 3121 SDValue Op0; 3122 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 3123 if (!isMLA) { 3124 Op0 = skipExtensionForVectorMULL(N0, DAG); 3125 assert(Op0.getValueType().is64BitVector() && 3126 Op1.getValueType().is64BitVector() && 3127 "unexpected types for extended operands to VMULL"); 3128 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 3129 } 3130 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 3131 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 
3132 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 3133 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 3134 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 3135 EVT Op1VT = Op1.getValueType(); 3136 return DAG.getNode(N0->getOpcode(), DL, VT, 3137 DAG.getNode(NewOpc, DL, VT, 3138 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 3139 DAG.getNode(NewOpc, DL, VT, 3140 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 3141 } 3142 3143 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, 3144 int Pattern) { 3145 return DAG.getNode(AArch64ISD::PTRUE, DL, VT, 3146 DAG.getTargetConstant(Pattern, DL, MVT::i32)); 3147 } 3148 3149 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 3150 SelectionDAG &DAG) const { 3151 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3152 SDLoc dl(Op); 3153 switch (IntNo) { 3154 default: return SDValue(); // Don't custom lower most intrinsics. 3155 case Intrinsic::thread_pointer: { 3156 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3157 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 3158 } 3159 case Intrinsic::aarch64_neon_abs: { 3160 EVT Ty = Op.getValueType(); 3161 if (Ty == MVT::i64) { 3162 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, 3163 Op.getOperand(1)); 3164 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); 3165 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); 3166 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { 3167 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); 3168 } else { 3169 report_fatal_error("Unexpected type for AArch64 NEON intrinic"); 3170 } 3171 } 3172 case Intrinsic::aarch64_neon_smax: 3173 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 3174 Op.getOperand(1), Op.getOperand(2)); 3175 case Intrinsic::aarch64_neon_umax: 3176 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 3177 Op.getOperand(1), Op.getOperand(2)); 3178 case Intrinsic::aarch64_neon_smin: 3179 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 3180 Op.getOperand(1), Op.getOperand(2)); 3181 case Intrinsic::aarch64_neon_umin: 3182 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 3183 Op.getOperand(1), Op.getOperand(2)); 3184 3185 case Intrinsic::aarch64_sve_sunpkhi: 3186 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), 3187 Op.getOperand(1)); 3188 case Intrinsic::aarch64_sve_sunpklo: 3189 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), 3190 Op.getOperand(1)); 3191 case Intrinsic::aarch64_sve_uunpkhi: 3192 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), 3193 Op.getOperand(1)); 3194 case Intrinsic::aarch64_sve_uunpklo: 3195 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), 3196 Op.getOperand(1)); 3197 case Intrinsic::aarch64_sve_clasta_n: 3198 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), 3199 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3200 case Intrinsic::aarch64_sve_clastb_n: 3201 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), 3202 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3203 case Intrinsic::aarch64_sve_lasta: 3204 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), 3205 Op.getOperand(1), Op.getOperand(2)); 3206 case Intrinsic::aarch64_sve_lastb: 3207 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), 3208 Op.getOperand(1), Op.getOperand(2)); 3209 case Intrinsic::aarch64_sve_rev: 3210 return DAG.getNode(AArch64ISD::REV, dl, 
Op.getValueType(), 3211 Op.getOperand(1)); 3212 case Intrinsic::aarch64_sve_tbl: 3213 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), 3214 Op.getOperand(1), Op.getOperand(2)); 3215 case Intrinsic::aarch64_sve_trn1: 3216 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), 3217 Op.getOperand(1), Op.getOperand(2)); 3218 case Intrinsic::aarch64_sve_trn2: 3219 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), 3220 Op.getOperand(1), Op.getOperand(2)); 3221 case Intrinsic::aarch64_sve_uzp1: 3222 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), 3223 Op.getOperand(1), Op.getOperand(2)); 3224 case Intrinsic::aarch64_sve_uzp2: 3225 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), 3226 Op.getOperand(1), Op.getOperand(2)); 3227 case Intrinsic::aarch64_sve_zip1: 3228 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), 3229 Op.getOperand(1), Op.getOperand(2)); 3230 case Intrinsic::aarch64_sve_zip2: 3231 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), 3232 Op.getOperand(1), Op.getOperand(2)); 3233 case Intrinsic::aarch64_sve_ptrue: 3234 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), 3235 Op.getOperand(1)); 3236 case Intrinsic::aarch64_sve_dupq_lane: 3237 return LowerDUPQLane(Op, DAG); 3238 case Intrinsic::aarch64_sve_convert_from_svbool: 3239 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), 3240 Op.getOperand(1)); 3241 case Intrinsic::aarch64_sve_convert_to_svbool: { 3242 EVT OutVT = Op.getValueType(); 3243 EVT InVT = Op.getOperand(1).getValueType(); 3244 // Return the operand if the cast isn't changing type, 3245 // i.e. <n x 16 x i1> -> <n x 16 x i1> 3246 if (InVT == OutVT) 3247 return Op.getOperand(1); 3248 // Otherwise, zero the newly introduced lanes. 3249 SDValue Reinterpret = 3250 DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1)); 3251 SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all); 3252 SDValue MaskReinterpret = 3253 DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask); 3254 return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret); 3255 } 3256 3257 case Intrinsic::aarch64_sve_insr: { 3258 SDValue Scalar = Op.getOperand(2); 3259 EVT ScalarTy = Scalar.getValueType(); 3260 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 3261 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 3262 3263 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), 3264 Op.getOperand(1), Scalar); 3265 } 3266 3267 case Intrinsic::localaddress: { 3268 const auto &MF = DAG.getMachineFunction(); 3269 const auto *RegInfo = Subtarget->getRegisterInfo(); 3270 unsigned Reg = RegInfo->getLocalAddressRegister(MF); 3271 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, 3272 Op.getSimpleValueType()); 3273 } 3274 3275 case Intrinsic::eh_recoverfp: { 3276 // FIXME: This needs to be implemented to correctly handle highly aligned 3277 // stack objects. For now we simply return the incoming FP. Refer D53541 3278 // for more details. 3279 SDValue FnOp = Op.getOperand(1); 3280 SDValue IncomingFPOp = Op.getOperand(2); 3281 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 3282 auto *Fn = dyn_cast_or_null<Function>(GSD ? 
GSD->getGlobal() : nullptr);
3283 if (!Fn)
3284 report_fatal_error(
3285 "llvm.eh.recoverfp must take a function as the first argument");
3286 return IncomingFPOp;
3287 }
3288
3289 case Intrinsic::aarch64_neon_vsri:
3290 case Intrinsic::aarch64_neon_vsli: {
3291 EVT Ty = Op.getValueType();
3292
3293 if (!Ty.isVector())
3294 report_fatal_error("Unexpected type for aarch64_neon_vsli");
3295
3296 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
3297
3298 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
3299 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
3300 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
3301 Op.getOperand(3));
3302 }
3303
3304 case Intrinsic::aarch64_neon_srhadd:
3305 case Intrinsic::aarch64_neon_urhadd: {
3306 bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
3307 unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
3308 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
3309 Op.getOperand(2));
3310 }
3311 }
3312 }
3313
3314 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
3315 return ExtVal.getValueType().isScalableVector();
3316 }
3317
3318 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
3319 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
3320 EVT VT, EVT MemVT,
3321 SelectionDAG &DAG) {
3322 assert(VT.isVector() && "VT should be a vector type");
3323 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
3324
3325 SDValue Value = ST->getValue();
3326
3327 // First extend the promoted v4i16 to v8i16, then truncate to v8i8 and extract
3328 // the word lane that represents the v4i8 subvector. This optimizes the store
3329 // to:
3330 //
3331 // xtn v0.8b, v0.8h
3332 // str s0, [x0]
3333
3334 SDValue Undef = DAG.getUNDEF(MVT::i16);
3335 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
3336 {Undef, Undef, Undef, Undef});
3337
3338 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
3339 Value, UndefVec);
3340 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
3341
3342 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
3343 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
3344 Trunc, DAG.getConstant(0, DL, MVT::i64));
3345
3346 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
3347 ST->getBasePtr(), ST->getMemOperand());
3348 }
3349
3350 // Custom lowering for any store, vector or scalar, plain or truncating.
3351 // Currently we only custom lower truncating stores from v4i16 to v4i8 and
3352 // volatile stores of i128.
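// For example, a volatile i128 store is split into its two i64 halves and
// emitted as a single STP, while a 256-bit non-temporal vector store is split
// into two halves and emitted as an STNP.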
3353 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, 3354 SelectionDAG &DAG) const { 3355 SDLoc Dl(Op); 3356 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 3357 assert (StoreNode && "Can only custom lower store nodes"); 3358 3359 SDValue Value = StoreNode->getValue(); 3360 3361 EVT VT = Value.getValueType(); 3362 EVT MemVT = StoreNode->getMemoryVT(); 3363 3364 if (VT.isVector()) { 3365 if (useSVEForFixedLengthVectorVT(VT)) 3366 return LowerFixedLengthVectorStoreToSVE(Op, DAG); 3367 3368 unsigned AS = StoreNode->getAddressSpace(); 3369 Align Alignment = StoreNode->getAlign(); 3370 if (Alignment < MemVT.getStoreSize() && 3371 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), 3372 StoreNode->getMemOperand()->getFlags(), 3373 nullptr)) { 3374 return scalarizeVectorStore(StoreNode, DAG); 3375 } 3376 3377 if (StoreNode->isTruncatingStore()) { 3378 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); 3379 } 3380 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of 3381 // the custom lowering, as there are no un-paired non-temporal stores and 3382 // legalization will break up 256 bit inputs. 3383 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && 3384 MemVT.getVectorElementCount().Min % 2u == 0 && 3385 ((MemVT.getScalarSizeInBits() == 8u || 3386 MemVT.getScalarSizeInBits() == 16u || 3387 MemVT.getScalarSizeInBits() == 32u || 3388 MemVT.getScalarSizeInBits() == 64u))) { 3389 SDValue Lo = 3390 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 3391 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 3392 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); 3393 SDValue Hi = DAG.getNode( 3394 ISD::EXTRACT_SUBVECTOR, Dl, 3395 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 3396 StoreNode->getValue(), 3397 DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); 3398 SDValue Result = DAG.getMemIntrinsicNode( 3399 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), 3400 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 3401 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 3402 return Result; 3403 } 3404 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { 3405 assert(StoreNode->getValue()->getValueType(0) == MVT::i128); 3406 SDValue Lo = 3407 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), 3408 DAG.getConstant(0, Dl, MVT::i64)); 3409 SDValue Hi = 3410 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), 3411 DAG.getConstant(1, Dl, MVT::i64)); 3412 SDValue Result = DAG.getMemIntrinsicNode( 3413 AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other), 3414 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 3415 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 3416 return Result; 3417 } 3418 3419 return SDValue(); 3420 } 3421 3422 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 3423 SelectionDAG &DAG) const { 3424 LLVM_DEBUG(dbgs() << "Custom lowering: "); 3425 LLVM_DEBUG(Op.dump()); 3426 3427 switch (Op.getOpcode()) { 3428 default: 3429 llvm_unreachable("unimplemented operand"); 3430 return SDValue(); 3431 case ISD::BITCAST: 3432 return LowerBITCAST(Op, DAG); 3433 case ISD::GlobalAddress: 3434 return LowerGlobalAddress(Op, DAG); 3435 case ISD::GlobalTLSAddress: 3436 return LowerGlobalTLSAddress(Op, DAG); 3437 case ISD::SETCC: 3438 case ISD::STRICT_FSETCC: 3439 case ISD::STRICT_FSETCCS: 3440 return LowerSETCC(Op, DAG); 3441 case ISD::BR_CC: 3442 return LowerBR_CC(Op, DAG); 3443 case ISD::SELECT: 3444 return LowerSELECT(Op, DAG); 3445 case ISD::SELECT_CC: 
3446 return LowerSELECT_CC(Op, DAG); 3447 case ISD::JumpTable: 3448 return LowerJumpTable(Op, DAG); 3449 case ISD::BR_JT: 3450 return LowerBR_JT(Op, DAG); 3451 case ISD::ConstantPool: 3452 return LowerConstantPool(Op, DAG); 3453 case ISD::BlockAddress: 3454 return LowerBlockAddress(Op, DAG); 3455 case ISD::VASTART: 3456 return LowerVASTART(Op, DAG); 3457 case ISD::VACOPY: 3458 return LowerVACOPY(Op, DAG); 3459 case ISD::VAARG: 3460 return LowerVAARG(Op, DAG); 3461 case ISD::ADDC: 3462 case ISD::ADDE: 3463 case ISD::SUBC: 3464 case ISD::SUBE: 3465 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 3466 case ISD::SADDO: 3467 case ISD::UADDO: 3468 case ISD::SSUBO: 3469 case ISD::USUBO: 3470 case ISD::SMULO: 3471 case ISD::UMULO: 3472 return LowerXALUO(Op, DAG); 3473 case ISD::FADD: 3474 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 3475 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); 3476 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 3477 case ISD::FSUB: 3478 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 3479 case ISD::FMUL: 3480 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 3481 case ISD::FMA: 3482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); 3483 case ISD::FDIV: 3484 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 3485 case ISD::FP_ROUND: 3486 case ISD::STRICT_FP_ROUND: 3487 return LowerFP_ROUND(Op, DAG); 3488 case ISD::FP_EXTEND: 3489 return LowerFP_EXTEND(Op, DAG); 3490 case ISD::FRAMEADDR: 3491 return LowerFRAMEADDR(Op, DAG); 3492 case ISD::SPONENTRY: 3493 return LowerSPONENTRY(Op, DAG); 3494 case ISD::RETURNADDR: 3495 return LowerRETURNADDR(Op, DAG); 3496 case ISD::ADDROFRETURNADDR: 3497 return LowerADDROFRETURNADDR(Op, DAG); 3498 case ISD::INSERT_VECTOR_ELT: 3499 return LowerINSERT_VECTOR_ELT(Op, DAG); 3500 case ISD::EXTRACT_VECTOR_ELT: 3501 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 3502 case ISD::BUILD_VECTOR: 3503 return LowerBUILD_VECTOR(Op, DAG); 3504 case ISD::VECTOR_SHUFFLE: 3505 return LowerVECTOR_SHUFFLE(Op, DAG); 3506 case ISD::SPLAT_VECTOR: 3507 return LowerSPLAT_VECTOR(Op, DAG); 3508 case ISD::EXTRACT_SUBVECTOR: 3509 return LowerEXTRACT_SUBVECTOR(Op, DAG); 3510 case ISD::INSERT_SUBVECTOR: 3511 return LowerINSERT_SUBVECTOR(Op, DAG); 3512 case ISD::SDIV: 3513 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED); 3514 case ISD::UDIV: 3515 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED); 3516 case ISD::SMIN: 3517 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1); 3518 case ISD::UMIN: 3519 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1); 3520 case ISD::SMAX: 3521 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1); 3522 case ISD::UMAX: 3523 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1); 3524 case ISD::SRA: 3525 case ISD::SRL: 3526 case ISD::SHL: 3527 return LowerVectorSRA_SRL_SHL(Op, DAG); 3528 case ISD::SHL_PARTS: 3529 return LowerShiftLeftParts(Op, DAG); 3530 case ISD::SRL_PARTS: 3531 case ISD::SRA_PARTS: 3532 return LowerShiftRightParts(Op, DAG); 3533 case ISD::CTPOP: 3534 return LowerCTPOP(Op, DAG); 3535 case ISD::FCOPYSIGN: 3536 return LowerFCOPYSIGN(Op, DAG); 3537 case ISD::OR: 3538 return LowerVectorOR(Op, DAG); 3539 case ISD::XOR: 3540 return LowerXOR(Op, DAG); 3541 case ISD::PREFETCH: 3542 return LowerPREFETCH(Op, DAG); 3543 case ISD::SINT_TO_FP: 3544 case ISD::UINT_TO_FP: 3545 case ISD::STRICT_SINT_TO_FP: 3546 case ISD::STRICT_UINT_TO_FP: 3547 return LowerINT_TO_FP(Op, DAG); 3548 case ISD::FP_TO_SINT: 3549 case ISD::FP_TO_UINT: 3550 case ISD::STRICT_FP_TO_SINT: 3551 
case ISD::STRICT_FP_TO_UINT: 3552 return LowerFP_TO_INT(Op, DAG); 3553 case ISD::FSINCOS: 3554 return LowerFSINCOS(Op, DAG); 3555 case ISD::FLT_ROUNDS_: 3556 return LowerFLT_ROUNDS_(Op, DAG); 3557 case ISD::MUL: 3558 return LowerMUL(Op, DAG); 3559 case ISD::INTRINSIC_WO_CHAIN: 3560 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 3561 case ISD::STORE: 3562 return LowerSTORE(Op, DAG); 3563 case ISD::VECREDUCE_ADD: 3564 case ISD::VECREDUCE_SMAX: 3565 case ISD::VECREDUCE_SMIN: 3566 case ISD::VECREDUCE_UMAX: 3567 case ISD::VECREDUCE_UMIN: 3568 case ISD::VECREDUCE_FMAX: 3569 case ISD::VECREDUCE_FMIN: 3570 return LowerVECREDUCE(Op, DAG); 3571 case ISD::ATOMIC_LOAD_SUB: 3572 return LowerATOMIC_LOAD_SUB(Op, DAG); 3573 case ISD::ATOMIC_LOAD_AND: 3574 return LowerATOMIC_LOAD_AND(Op, DAG); 3575 case ISD::DYNAMIC_STACKALLOC: 3576 return LowerDYNAMIC_STACKALLOC(Op, DAG); 3577 case ISD::VSCALE: 3578 return LowerVSCALE(Op, DAG); 3579 case ISD::TRUNCATE: 3580 return LowerTRUNCATE(Op, DAG); 3581 case ISD::LOAD: 3582 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 3583 return LowerFixedLengthVectorLoadToSVE(Op, DAG); 3584 llvm_unreachable("Unexpected request to lower ISD::LOAD"); 3585 case ISD::ADD: 3586 if (useSVEForFixedLengthVectorVT(Op.getValueType())) 3587 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); 3588 llvm_unreachable("Unexpected request to lower ISD::ADD"); 3589 } 3590 } 3591 3592 bool AArch64TargetLowering::useSVEForFixedLengthVectors() const { 3593 // Prefer NEON unless larger SVE registers are available. 3594 return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256; 3595 } 3596 3597 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const { 3598 if (!useSVEForFixedLengthVectors()) 3599 return false; 3600 3601 if (!VT.isFixedLengthVector()) 3602 return false; 3603 3604 // Fixed length predicates should be promoted to i8. 3605 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. 3606 if (VT.getVectorElementType() == MVT::i1) 3607 return false; 3608 3609 // Don't use SVE for vectors we cannot scalarize if required. 3610 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 3611 default: 3612 return false; 3613 case MVT::i8: 3614 case MVT::i16: 3615 case MVT::i32: 3616 case MVT::i64: 3617 case MVT::f16: 3618 case MVT::f32: 3619 case MVT::f64: 3620 break; 3621 } 3622 3623 // Ensure NEON MVTs only belong to a single register class. 3624 if (VT.getSizeInBits() <= 128) 3625 return false; 3626 3627 // Don't use SVE for types that don't fit. 3628 if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) 3629 return false; 3630 3631 // TODO: Perhaps an artificial restriction, but worth having whilst getting 3632 // the base fixed length SVE support in place. 3633 if (!VT.isPow2VectorType()) 3634 return false; 3635 3636 return true; 3637 } 3638 3639 //===----------------------------------------------------------------------===// 3640 // Calling Convention Implementation 3641 //===----------------------------------------------------------------------===// 3642 3643 /// Selects the correct CCAssignFn for a given CallingConvention value. 
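/// For example, variadic calls on Windows map to CC_AArch64_Win64_VarArg,
/// Darwin calls use the DarwinPCS variants, and the common C-family
/// conventions elsewhere use CC_AArch64_AAPCS.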
3644 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 3645 bool IsVarArg) const { 3646 switch (CC) { 3647 default: 3648 report_fatal_error("Unsupported calling convention."); 3649 case CallingConv::WebKit_JS: 3650 return CC_AArch64_WebKit_JS; 3651 case CallingConv::GHC: 3652 return CC_AArch64_GHC; 3653 case CallingConv::C: 3654 case CallingConv::Fast: 3655 case CallingConv::PreserveMost: 3656 case CallingConv::CXX_FAST_TLS: 3657 case CallingConv::Swift: 3658 if (Subtarget->isTargetWindows() && IsVarArg) 3659 return CC_AArch64_Win64_VarArg; 3660 if (!Subtarget->isTargetDarwin()) 3661 return CC_AArch64_AAPCS; 3662 if (!IsVarArg) 3663 return CC_AArch64_DarwinPCS; 3664 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg 3665 : CC_AArch64_DarwinPCS_VarArg; 3666 case CallingConv::Win64: 3667 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; 3668 case CallingConv::CFGuard_Check: 3669 return CC_AArch64_Win64_CFGuard_Check; 3670 case CallingConv::AArch64_VectorCall: 3671 case CallingConv::AArch64_SVE_VectorCall: 3672 return CC_AArch64_AAPCS; 3673 } 3674 } 3675 3676 CCAssignFn * 3677 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { 3678 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS 3679 : RetCC_AArch64_AAPCS; 3680 } 3681 3682 SDValue AArch64TargetLowering::LowerFormalArguments( 3683 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3684 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3685 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3686 MachineFunction &MF = DAG.getMachineFunction(); 3687 MachineFrameInfo &MFI = MF.getFrameInfo(); 3688 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 3689 3690 // Assign locations to all of the incoming arguments. 3691 SmallVector<CCValAssign, 16> ArgLocs; 3692 DenseMap<unsigned, SDValue> CopiedRegs; 3693 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3694 *DAG.getContext()); 3695 3696 // At this point, Ins[].VT may already be promoted to i32. To correctly 3697 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 3698 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 3699 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 3700 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 3701 // LocVT. 3702 unsigned NumArgs = Ins.size(); 3703 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3704 unsigned CurArgIdx = 0; 3705 for (unsigned i = 0; i != NumArgs; ++i) { 3706 MVT ValVT = Ins[i].VT; 3707 if (Ins[i].isOrigArg()) { 3708 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 3709 CurArgIdx = Ins[i].getOrigArgIndex(); 3710 3711 // Get type of the original argument. 3712 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 3713 /*AllowUnknown*/ true); 3714 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 3715 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
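// e.g. an i8 argument that arrives promoted to i32 in Ins[i].VT is re-derived
// from its IR type here so it is assigned an i8 stack slot rather than i32.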
3716 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 3717 ValVT = MVT::i8; 3718 else if (ActualMVT == MVT::i16) 3719 ValVT = MVT::i16; 3720 } 3721 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 3722 bool Res = 3723 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 3724 assert(!Res && "Call operand has unhandled type"); 3725 (void)Res; 3726 } 3727 assert(ArgLocs.size() == Ins.size()); 3728 SmallVector<SDValue, 16> ArgValues; 3729 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3730 CCValAssign &VA = ArgLocs[i]; 3731 3732 if (Ins[i].Flags.isByVal()) { 3733 // Byval is used for HFAs in the PCS, but the system should work in a 3734 // non-compliant manner for larger structs. 3735 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3736 int Size = Ins[i].Flags.getByValSize(); 3737 unsigned NumRegs = (Size + 7) / 8; 3738 3739 // FIXME: This works on big-endian for composite byvals, which are the common 3740 // case. It should also work for fundamental types too. 3741 unsigned FrameIdx = 3742 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 3743 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 3744 InVals.push_back(FrameIdxN); 3745 3746 continue; 3747 } 3748 3749 SDValue ArgValue; 3750 if (VA.isRegLoc()) { 3751 // Arguments stored in registers. 3752 EVT RegVT = VA.getLocVT(); 3753 const TargetRegisterClass *RC; 3754 3755 if (RegVT == MVT::i32) 3756 RC = &AArch64::GPR32RegClass; 3757 else if (RegVT == MVT::i64) 3758 RC = &AArch64::GPR64RegClass; 3759 else if (RegVT == MVT::f16 || RegVT == MVT::bf16) 3760 RC = &AArch64::FPR16RegClass; 3761 else if (RegVT == MVT::f32) 3762 RC = &AArch64::FPR32RegClass; 3763 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 3764 RC = &AArch64::FPR64RegClass; 3765 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 3766 RC = &AArch64::FPR128RegClass; 3767 else if (RegVT.isScalableVector() && 3768 RegVT.getVectorElementType() == MVT::i1) 3769 RC = &AArch64::PPRRegClass; 3770 else if (RegVT.isScalableVector()) 3771 RC = &AArch64::ZPRRegClass; 3772 else 3773 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3774 3775 // Transform the arguments in physical registers into virtual ones. 3776 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3777 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 3778 3779 // If this is an 8, 16 or 32-bit value, it is really passed promoted 3780 // to 64 bits. Insert an assert[sz]ext to capture this, then 3781 // truncate to the right size. 3782 switch (VA.getLocInfo()) { 3783 default: 3784 llvm_unreachable("Unknown loc info!"); 3785 case CCValAssign::Full: 3786 break; 3787 case CCValAssign::Indirect: 3788 assert(VA.getValVT().isScalableVector() && 3789 "Only scalable vectors can be passed indirectly"); 3790 break; 3791 case CCValAssign::BCvt: 3792 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 3793 break; 3794 case CCValAssign::AExt: 3795 case CCValAssign::SExt: 3796 case CCValAssign::ZExt: 3797 break; 3798 case CCValAssign::AExtUpper: 3799 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, 3800 DAG.getConstant(32, DL, RegVT)); 3801 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); 3802 break; 3803 } 3804 } else { // VA.isRegLoc() 3805 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 3806 unsigned ArgOffset = VA.getLocMemOffset(); 3807 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect 3808 ? 
VA.getLocVT().getSizeInBits() 3809 : VA.getValVT().getSizeInBits()) / 8; 3810 3811 uint32_t BEAlign = 0; 3812 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 3813 !Ins[i].Flags.isInConsecutiveRegs()) 3814 BEAlign = 8 - ArgSize; 3815 3816 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 3817 3818 // Create load nodes to retrieve arguments from the stack. 3819 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3820 3821 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 3822 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 3823 MVT MemVT = VA.getValVT(); 3824 3825 switch (VA.getLocInfo()) { 3826 default: 3827 break; 3828 case CCValAssign::Trunc: 3829 case CCValAssign::BCvt: 3830 MemVT = VA.getLocVT(); 3831 break; 3832 case CCValAssign::Indirect: 3833 assert(VA.getValVT().isScalableVector() && 3834 "Only scalable vectors can be passed indirectly"); 3835 MemVT = VA.getLocVT(); 3836 break; 3837 case CCValAssign::SExt: 3838 ExtType = ISD::SEXTLOAD; 3839 break; 3840 case CCValAssign::ZExt: 3841 ExtType = ISD::ZEXTLOAD; 3842 break; 3843 case CCValAssign::AExt: 3844 ExtType = ISD::EXTLOAD; 3845 break; 3846 } 3847 3848 ArgValue = DAG.getExtLoad( 3849 ExtType, DL, VA.getLocVT(), Chain, FIN, 3850 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3851 MemVT); 3852 3853 } 3854 3855 if (VA.getLocInfo() == CCValAssign::Indirect) { 3856 assert(VA.getValVT().isScalableVector() && 3857 "Only scalable vectors can be passed indirectly"); 3858 // If value is passed via pointer - do a load. 3859 ArgValue = 3860 DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo()); 3861 } 3862 3863 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) 3864 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), 3865 ArgValue, DAG.getValueType(MVT::i32)); 3866 InVals.push_back(ArgValue); 3867 } 3868 3869 // varargs 3870 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3871 if (isVarArg) { 3872 if (!Subtarget->isTargetDarwin() || IsWin64) { 3873 // The AAPCS variadic function ABI is identical to the non-variadic 3874 // one. As a result there may be more arguments in registers and we should 3875 // save them for future reference. 3876 // Win64 variadic functions also pass arguments in registers, but all float 3877 // arguments are passed in integer registers. 3878 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 3879 } 3880 3881 // This will point to the next argument passed via stack. 3882 unsigned StackOffset = CCInfo.getNextStackOffset(); 3883 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 3884 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8); 3885 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); 3886 3887 if (MFI.hasMustTailInVarArgFunc()) { 3888 SmallVector<MVT, 2> RegParmTypes; 3889 RegParmTypes.push_back(MVT::i64); 3890 RegParmTypes.push_back(MVT::f128); 3891 // Compute the set of forwarded registers. The rest are scratch. 3892 SmallVectorImpl<ForwardedRegister> &Forwards = 3893 FuncInfo->getForwardedMustTailRegParms(); 3894 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, 3895 CC_AArch64_AAPCS); 3896 3897 // Conservatively forward X8, since it might be used for aggregate return. 
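// (X8 is the indirect result location register in AAPCS64, so a musttail
// callee returning a large aggregate may still need the incoming value.)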
3898 if (!CCInfo.isAllocated(AArch64::X8)) { 3899 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); 3900 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); 3901 } 3902 } 3903 } 3904 3905 // On Windows, InReg pointers must be returned, so record the pointer in a 3906 // virtual register at the start of the function so it can be returned in the 3907 // epilogue. 3908 if (IsWin64) { 3909 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 3910 if (Ins[I].Flags.isInReg()) { 3911 assert(!FuncInfo->getSRetReturnReg()); 3912 3913 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 3914 Register Reg = 3915 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 3916 FuncInfo->setSRetReturnReg(Reg); 3917 3918 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); 3919 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); 3920 break; 3921 } 3922 } 3923 } 3924 3925 unsigned StackArgSize = CCInfo.getNextStackOffset(); 3926 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3927 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 3928 // This is a non-standard ABI so by fiat I say we're allowed to make full 3929 // use of the stack area to be popped, which must be aligned to 16 bytes in 3930 // any case: 3931 StackArgSize = alignTo(StackArgSize, 16); 3932 3933 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 3934 // a multiple of 16. 3935 FuncInfo->setArgumentStackToRestore(StackArgSize); 3936 3937 // This realignment carries over to the available bytes below. Our own 3938 // callers will guarantee the space is free by giving an aligned value to 3939 // CALLSEQ_START. 3940 } 3941 // Even if we're not expected to free up the space, it's useful to know how 3942 // much is there while considering tail calls (because we can reuse it). 3943 FuncInfo->setBytesInStackArgArea(StackArgSize); 3944 3945 if (Subtarget->hasCustomCallingConv()) 3946 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); 3947 3948 return Chain; 3949 } 3950 3951 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 3952 SelectionDAG &DAG, 3953 const SDLoc &DL, 3954 SDValue &Chain) const { 3955 MachineFunction &MF = DAG.getMachineFunction(); 3956 MachineFrameInfo &MFI = MF.getFrameInfo(); 3957 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3958 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3959 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 3960 3961 SmallVector<SDValue, 8> MemOps; 3962 3963 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 3964 AArch64::X3, AArch64::X4, AArch64::X5, 3965 AArch64::X6, AArch64::X7 }; 3966 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 3967 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 3968 3969 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 3970 int GPRIdx = 0; 3971 if (GPRSaveSize != 0) { 3972 if (IsWin64) { 3973 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); 3974 if (GPRSaveSize & 15) 3975 // The extra size here, if triggered, will always be 8. 
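// Pad the Win64 GPR save area out to the next 16-byte boundary.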
3976 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); 3977 } else 3978 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); 3979 3980 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 3981 3982 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 3983 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 3984 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 3985 SDValue Store = DAG.getStore( 3986 Val.getValue(1), DL, Val, FIN, 3987 IsWin64 3988 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), 3989 GPRIdx, 3990 (i - FirstVariadicGPR) * 8) 3991 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); 3992 MemOps.push_back(Store); 3993 FIN = 3994 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 3995 } 3996 } 3997 FuncInfo->setVarArgsGPRIndex(GPRIdx); 3998 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 3999 4000 if (Subtarget->hasFPARMv8() && !IsWin64) { 4001 static const MCPhysReg FPRArgRegs[] = { 4002 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 4003 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 4004 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 4005 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 4006 4007 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 4008 int FPRIdx = 0; 4009 if (FPRSaveSize != 0) { 4010 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); 4011 4012 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 4013 4014 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 4015 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 4016 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 4017 4018 SDValue Store = DAG.getStore( 4019 Val.getValue(1), DL, Val, FIN, 4020 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); 4021 MemOps.push_back(Store); 4022 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 4023 DAG.getConstant(16, DL, PtrVT)); 4024 } 4025 } 4026 FuncInfo->setVarArgsFPRIndex(FPRIdx); 4027 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 4028 } 4029 4030 if (!MemOps.empty()) { 4031 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 4032 } 4033 } 4034 4035 /// LowerCallResult - Lower the result values of a call into the 4036 /// appropriate copies out of appropriate physical registers. 4037 SDValue AArch64TargetLowering::LowerCallResult( 4038 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 4039 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 4040 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 4041 SDValue ThisVal) const { 4042 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 4043 ? RetCC_AArch64_WebKit_JS 4044 : RetCC_AArch64_AAPCS; 4045 // Assign locations to each value returned by this call. 4046 SmallVector<CCValAssign, 16> RVLocs; 4047 DenseMap<unsigned, SDValue> CopiedRegs; 4048 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 4049 *DAG.getContext()); 4050 CCInfo.AnalyzeCallResult(Ins, RetCC); 4051 4052 // Copy all of the result registers out of their specified physreg. 
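// Each physical return register is copied out at most once (see CopiedRegs)
// and the copy is then adjusted to the value type recorded in RVLocs.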
4053 for (unsigned i = 0; i != RVLocs.size(); ++i) { 4054 CCValAssign VA = RVLocs[i]; 4055 4056 // Pass 'this' value directly from the argument to return value, to avoid 4057 // reg unit interference 4058 if (i == 0 && isThisReturn) { 4059 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 4060 "unexpected return calling convention register assignment"); 4061 InVals.push_back(ThisVal); 4062 continue; 4063 } 4064 4065 // Avoid copying a physreg twice since RegAllocFast is incompetent and only 4066 // allows one use of a physreg per block. 4067 SDValue Val = CopiedRegs.lookup(VA.getLocReg()); 4068 if (!Val) { 4069 Val = 4070 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 4071 Chain = Val.getValue(1); 4072 InFlag = Val.getValue(2); 4073 CopiedRegs[VA.getLocReg()] = Val; 4074 } 4075 4076 switch (VA.getLocInfo()) { 4077 default: 4078 llvm_unreachable("Unknown loc info!"); 4079 case CCValAssign::Full: 4080 break; 4081 case CCValAssign::BCvt: 4082 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 4083 break; 4084 case CCValAssign::AExtUpper: 4085 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, 4086 DAG.getConstant(32, DL, VA.getLocVT())); 4087 LLVM_FALLTHROUGH; 4088 case CCValAssign::AExt: 4089 LLVM_FALLTHROUGH; 4090 case CCValAssign::ZExt: 4091 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); 4092 break; 4093 } 4094 4095 InVals.push_back(Val); 4096 } 4097 4098 return Chain; 4099 } 4100 4101 /// Return true if the calling convention is one that we can guarantee TCO for. 4102 static bool canGuaranteeTCO(CallingConv::ID CC) { 4103 return CC == CallingConv::Fast; 4104 } 4105 4106 /// Return true if we might ever do TCO for calls with this calling convention. 4107 static bool mayTailCallThisCC(CallingConv::ID CC) { 4108 switch (CC) { 4109 case CallingConv::C: 4110 case CallingConv::AArch64_SVE_VectorCall: 4111 case CallingConv::PreserveMost: 4112 case CallingConv::Swift: 4113 return true; 4114 default: 4115 return canGuaranteeTCO(CC); 4116 } 4117 } 4118 4119 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 4120 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 4121 const SmallVectorImpl<ISD::OutputArg> &Outs, 4122 const SmallVectorImpl<SDValue> &OutVals, 4123 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 4124 if (!mayTailCallThisCC(CalleeCC)) 4125 return false; 4126 4127 MachineFunction &MF = DAG.getMachineFunction(); 4128 const Function &CallerF = MF.getFunction(); 4129 CallingConv::ID CallerCC = CallerF.getCallingConv(); 4130 4131 // If this function uses the C calling convention but has an SVE signature, 4132 // then it preserves more registers and should assume the SVE_VectorCall CC. 4133 // The check for matching callee-saved regs will determine whether it is 4134 // eligible for TCO. 4135 if (CallerCC == CallingConv::C && 4136 AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) 4137 CallerCC = CallingConv::AArch64_SVE_VectorCall; 4138 4139 bool CCMatch = CallerCC == CalleeCC; 4140 4141 // When using the Windows calling convention on a non-windows OS, we want 4142 // to back up and restore X18 in such functions; we can't do a tail call 4143 // from those functions. 4144 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && 4145 CalleeCC != CallingConv::Win64) 4146 return false; 4147 4148 // Byval parameters hand the function a pointer directly into the stack area 4149 // we want to reuse during a tail call. 
Working around this *is* possible (see 4150 // X86) but less efficient and uglier in LowerCall. 4151 for (Function::const_arg_iterator i = CallerF.arg_begin(), 4152 e = CallerF.arg_end(); 4153 i != e; ++i) { 4154 if (i->hasByValAttr()) 4155 return false; 4156 4157 // On Windows, "inreg" attributes signify non-aggregate indirect returns. 4158 // In this case, it is necessary to save/restore X0 in the callee. Tail 4159 // call opt interferes with this. So we disable tail call opt when the 4160 // caller has an argument with "inreg" attribute. 4161 4162 // FIXME: Check whether the callee also has an "inreg" argument. 4163 if (i->hasInRegAttr()) 4164 return false; 4165 } 4166 4167 if (getTargetMachine().Options.GuaranteedTailCallOpt) 4168 return canGuaranteeTCO(CalleeCC) && CCMatch; 4169 4170 // Externally-defined functions with weak linkage should not be 4171 // tail-called on AArch64 when the OS does not support dynamic 4172 // pre-emption of symbols, as the AAELF spec requires normal calls 4173 // to undefined weak functions to be replaced with a NOP or jump to the 4174 // next instruction. The behaviour of branch instructions in this 4175 // situation (as used for tail calls) is implementation-defined, so we 4176 // cannot rely on the linker replacing the tail call with a return. 4177 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 4178 const GlobalValue *GV = G->getGlobal(); 4179 const Triple &TT = getTargetMachine().getTargetTriple(); 4180 if (GV->hasExternalWeakLinkage() && 4181 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 4182 return false; 4183 } 4184 4185 // Now we search for cases where we can use a tail call without changing the 4186 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 4187 // concept. 4188 4189 // I want anyone implementing a new calling convention to think long and hard 4190 // about this assert. 4191 assert((!isVarArg || CalleeCC == CallingConv::C) && 4192 "Unexpected variadic calling convention"); 4193 4194 LLVMContext &C = *DAG.getContext(); 4195 if (isVarArg && !Outs.empty()) { 4196 // At least two cases here: if caller is fastcc then we can't have any 4197 // memory arguments (we'd be expected to clean up the stack afterwards). If 4198 // caller is C then we could potentially use its argument area. 4199 4200 // FIXME: for now we take the most conservative of these in both cases: 4201 // disallow all variadic memory operands. 4202 SmallVector<CCValAssign, 16> ArgLocs; 4203 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 4204 4205 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 4206 for (const CCValAssign &ArgLoc : ArgLocs) 4207 if (!ArgLoc.isRegLoc()) 4208 return false; 4209 } 4210 4211 // Check that the call results are passed in the same way. 4212 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 4213 CCAssignFnForCall(CalleeCC, isVarArg), 4214 CCAssignFnForCall(CallerCC, isVarArg))) 4215 return false; 4216 // The callee has to preserve all registers the caller needs to preserve. 
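// In other words, the caller's preserved register mask must be a subset of
// the callee's; otherwise a tail call could clobber registers the caller
// expects to survive.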
4217 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4218 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4219 if (!CCMatch) {
4220 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4221 if (Subtarget->hasCustomCallingConv()) {
4222 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
4223 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
4224 }
4225 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4226 return false;
4227 }
4228
4229 // Nothing more to check if the callee is taking no arguments.
4230 if (Outs.empty())
4231 return true;
4232
4233 SmallVector<CCValAssign, 16> ArgLocs;
4234 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4235
4236 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
4237
4238 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4239
4240 // If any of the arguments is passed indirectly, it must be SVE, so
4241 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
4242 // allocate space on the stack. That is why we decide explicitly here that
4243 // such a call cannot be a tail call.
4244 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
4245 assert((A.getLocInfo() != CCValAssign::Indirect ||
4246 A.getValVT().isScalableVector()) &&
4247 "Expected value to be scalable");
4248 return A.getLocInfo() == CCValAssign::Indirect;
4249 }))
4250 return false;
4251
4252 // If the stack arguments for this call do not fit into our own save area then
4253 // the call cannot be made tail.
4254 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
4255 return false;
4256
4257 const MachineRegisterInfo &MRI = MF.getRegInfo();
4258 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4259 return false;
4260
4261 return true;
4262 }
4263
4264 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
4265 SelectionDAG &DAG,
4266 MachineFrameInfo &MFI,
4267 int ClobberedFI) const {
4268 SmallVector<SDValue, 8> ArgChains;
4269 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
4270 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
4271
4272 // Include the original chain at the beginning of the list. When this is
4273 // used by target LowerCall hooks, this helps legalize find the
4274 // CALLSEQ_BEGIN node.
4275 ArgChains.push_back(Chain);
4276
4277 // Add a chain value for each stack argument load that overlaps the clobbered slot.
4278 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
4279 UE = DAG.getEntryNode().getNode()->use_end();
4280 U != UE; ++U)
4281 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
4282 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
4283 if (FI->getIndex() < 0) {
4284 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
4285 int64_t InLastByte = InFirstByte;
4286 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
4287
4288 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
4289 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
4290 ArgChains.push_back(SDValue(L, 1));
4291 }
4292
4293 // Build a tokenfactor for all the chains.
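// This keeps any later store into the clobbered slot ordered after the loads
// collected above.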
4294 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 4295 } 4296 4297 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 4298 bool TailCallOpt) const { 4299 return CallCC == CallingConv::Fast && TailCallOpt; 4300 } 4301 4302 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 4303 /// and add input and output parameter nodes. 4304 SDValue 4305 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 4306 SmallVectorImpl<SDValue> &InVals) const { 4307 SelectionDAG &DAG = CLI.DAG; 4308 SDLoc &DL = CLI.DL; 4309 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 4310 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 4311 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 4312 SDValue Chain = CLI.Chain; 4313 SDValue Callee = CLI.Callee; 4314 bool &IsTailCall = CLI.IsTailCall; 4315 CallingConv::ID CallConv = CLI.CallConv; 4316 bool IsVarArg = CLI.IsVarArg; 4317 4318 MachineFunction &MF = DAG.getMachineFunction(); 4319 MachineFunction::CallSiteInfo CSInfo; 4320 bool IsThisReturn = false; 4321 4322 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4323 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 4324 bool IsSibCall = false; 4325 4326 // Check callee args/returns for SVE registers and set calling convention 4327 // accordingly. 4328 if (CallConv == CallingConv::C) { 4329 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ 4330 return Out.VT.isScalableVector(); 4331 }); 4332 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ 4333 return In.VT.isScalableVector(); 4334 }); 4335 4336 if (CalleeInSVE || CalleeOutSVE) 4337 CallConv = CallingConv::AArch64_SVE_VectorCall; 4338 } 4339 4340 if (IsTailCall) { 4341 // Check if it's really possible to do a tail call. 4342 IsTailCall = isEligibleForTailCallOptimization( 4343 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 4344 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) 4345 report_fatal_error("failed to perform tail call elimination on a call " 4346 "site marked musttail"); 4347 4348 // A sibling call is one where we're under the usual C ABI and not planning 4349 // to change that but can still do a tail call: 4350 if (!TailCallOpt && IsTailCall) 4351 IsSibCall = true; 4352 4353 if (IsTailCall) 4354 ++NumTailCalls; 4355 } 4356 4357 // Analyze operands of the call, assigning locations to each operand. 4358 SmallVector<CCValAssign, 16> ArgLocs; 4359 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 4360 *DAG.getContext()); 4361 4362 if (IsVarArg) { 4363 // Handle fixed and variable vector arguments differently. 4364 // Variable vector arguments always go into memory. 4365 unsigned NumArgs = Outs.size(); 4366 4367 for (unsigned i = 0; i != NumArgs; ++i) { 4368 MVT ArgVT = Outs[i].VT; 4369 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 4370 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 4371 /*IsVarArg=*/ !Outs[i].IsFixed); 4372 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 4373 assert(!Res && "Call operand has unhandled type"); 4374 (void)Res; 4375 } 4376 } else { 4377 // At this point, Outs[].VT may already be promoted to i32. To correctly 4378 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 4379 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 4380 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 4381 // we use a special version of AnalyzeCallOperands to pass in ValVT and 4382 // LocVT. 
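// e.g. an outgoing i8 argument that was promoted to i32 in Outs[i].VT is
// re-derived from the IR type of the call so it is assigned an i8 slot.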
4383 unsigned NumArgs = Outs.size(); 4384 for (unsigned i = 0; i != NumArgs; ++i) { 4385 MVT ValVT = Outs[i].VT; 4386 // Get type of the original argument. 4387 EVT ActualVT = getValueType(DAG.getDataLayout(), 4388 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 4389 /*AllowUnknown*/ true); 4390 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 4391 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 4392 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 4393 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 4394 ValVT = MVT::i8; 4395 else if (ActualMVT == MVT::i16) 4396 ValVT = MVT::i16; 4397 4398 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 4399 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 4400 assert(!Res && "Call operand has unhandled type"); 4401 (void)Res; 4402 } 4403 } 4404 4405 // Get a count of how many bytes are to be pushed on the stack. 4406 unsigned NumBytes = CCInfo.getNextStackOffset(); 4407 4408 if (IsSibCall) { 4409 // Since we're not changing the ABI to make this a tail call, the memory 4410 // operands are already available in the caller's incoming argument space. 4411 NumBytes = 0; 4412 } 4413 4414 // FPDiff is the byte offset of the call's argument area from the callee's. 4415 // Stores to callee stack arguments will be placed in FixedStackSlots offset 4416 // by this amount for a tail call. In a sibling call it must be 0 because the 4417 // caller will deallocate the entire stack and the callee still expects its 4418 // arguments to begin at SP+0. Completely unused for non-tail calls. 4419 int FPDiff = 0; 4420 4421 if (IsTailCall && !IsSibCall) { 4422 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 4423 4424 // Since callee will pop argument stack as a tail call, we must keep the 4425 // popped size 16-byte aligned. 4426 NumBytes = alignTo(NumBytes, 16); 4427 4428 // FPDiff will be negative if this tail call requires more space than we 4429 // would automatically have in our incoming argument space. Positive if we 4430 // can actually shrink the stack. 4431 FPDiff = NumReusableBytes - NumBytes; 4432 4433 // The stack pointer must be 16-byte aligned at all times it's used for a 4434 // memory operation, which in practice means at *all* times and in 4435 // particular across call boundaries. Therefore our own arguments started at 4436 // a 16-byte aligned SP and the delta applied for the tail call should 4437 // satisfy the same constraint. 4438 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 4439 } 4440 4441 // Adjust the stack pointer for the new arguments... 4442 // These operations are automatically eliminated by the prolog/epilog pass 4443 if (!IsSibCall) 4444 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); 4445 4446 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 4447 getPointerTy(DAG.getDataLayout())); 4448 4449 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 4450 SmallSet<unsigned, 8> RegsUsed; 4451 SmallVector<SDValue, 8> MemOpChains; 4452 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4453 4454 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { 4455 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 4456 for (const auto &F : Forwards) { 4457 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); 4458 RegsToPass.emplace_back(F.PReg, Val); 4459 } 4460 } 4461 4462 // Walk the register/memloc assignments, inserting copies/loads. 
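// Register arguments are collected in RegsToPass; stack arguments become
// stores into the outgoing argument area (or, for tail calls, into the
// caller's own argument area offset by FPDiff).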
4463 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4464 CCValAssign &VA = ArgLocs[i]; 4465 SDValue Arg = OutVals[i]; 4466 ISD::ArgFlagsTy Flags = Outs[i].Flags; 4467 4468 // Promote the value if needed. 4469 switch (VA.getLocInfo()) { 4470 default: 4471 llvm_unreachable("Unknown loc info!"); 4472 case CCValAssign::Full: 4473 break; 4474 case CCValAssign::SExt: 4475 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 4476 break; 4477 case CCValAssign::ZExt: 4478 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 4479 break; 4480 case CCValAssign::AExt: 4481 if (Outs[i].ArgVT == MVT::i1) { 4482 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 4483 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 4484 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 4485 } 4486 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 4487 break; 4488 case CCValAssign::AExtUpper: 4489 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 4490 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 4491 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 4492 DAG.getConstant(32, DL, VA.getLocVT())); 4493 break; 4494 case CCValAssign::BCvt: 4495 Arg = DAG.getBitcast(VA.getLocVT(), Arg); 4496 break; 4497 case CCValAssign::Trunc: 4498 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 4499 break; 4500 case CCValAssign::FPExt: 4501 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 4502 break; 4503 case CCValAssign::Indirect: 4504 assert(VA.getValVT().isScalableVector() && 4505 "Only scalable vectors can be passed indirectly"); 4506 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 4507 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); 4508 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); 4509 int FI = MFI.CreateStackObject( 4510 VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false); 4511 MFI.setStackID(FI, TargetStackID::SVEVector); 4512 4513 SDValue SpillSlot = DAG.getFrameIndex( 4514 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 4515 Chain = DAG.getStore( 4516 Chain, DL, Arg, SpillSlot, 4517 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4518 Arg = SpillSlot; 4519 break; 4520 } 4521 4522 if (VA.isRegLoc()) { 4523 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 4524 Outs[0].VT == MVT::i64) { 4525 assert(VA.getLocVT() == MVT::i64 && 4526 "unexpected calling convention register assignment"); 4527 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 4528 "unexpected use of 'returned'"); 4529 IsThisReturn = true; 4530 } 4531 if (RegsUsed.count(VA.getLocReg())) { 4532 // If this register has already been used then we're trying to pack 4533 // parts of an [N x i32] into an X-register. The extension type will 4534 // take care of putting the two halves in the right place but we have to 4535 // combine them. 4536 SDValue &Bits = 4537 std::find_if(RegsToPass.begin(), RegsToPass.end(), 4538 [=](const std::pair<unsigned, SDValue> &Elt) { 4539 return Elt.first == VA.getLocReg(); 4540 }) 4541 ->second; 4542 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 4543 // Call site info is used for function's parameter entry value 4544 // tracking. For now we track only simple cases when parameter 4545 // is transferred through whole register. 
4546 CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(), 4547 [&VA](MachineFunction::ArgRegPair ArgReg) { 4548 return ArgReg.Reg == VA.getLocReg(); 4549 }), 4550 CSInfo.end()); 4551 } else { 4552 RegsToPass.emplace_back(VA.getLocReg(), Arg); 4553 RegsUsed.insert(VA.getLocReg()); 4554 const TargetOptions &Options = DAG.getTarget().Options; 4555 if (Options.EmitCallSiteInfo) 4556 CSInfo.emplace_back(VA.getLocReg(), i); 4557 } 4558 } else { 4559 assert(VA.isMemLoc()); 4560 4561 SDValue DstAddr; 4562 MachinePointerInfo DstInfo; 4563 4564 // FIXME: This works on big-endian for composite byvals, which are the 4565 // common case. It should also work for fundamental types too. 4566 uint32_t BEAlign = 0; 4567 unsigned OpSize; 4568 if (VA.getLocInfo() == CCValAssign::Indirect) 4569 OpSize = VA.getLocVT().getSizeInBits(); 4570 else 4571 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 4572 : VA.getValVT().getSizeInBits(); 4573 OpSize = (OpSize + 7) / 8; 4574 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 4575 !Flags.isInConsecutiveRegs()) { 4576 if (OpSize < 8) 4577 BEAlign = 8 - OpSize; 4578 } 4579 unsigned LocMemOffset = VA.getLocMemOffset(); 4580 int32_t Offset = LocMemOffset + BEAlign; 4581 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 4582 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 4583 4584 if (IsTailCall) { 4585 Offset = Offset + FPDiff; 4586 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 4587 4588 DstAddr = DAG.getFrameIndex(FI, PtrVT); 4589 DstInfo = 4590 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 4591 4592 // Make sure any stack arguments overlapping with where we're storing 4593 // are loaded before this eventual operation. Otherwise they'll be 4594 // clobbered. 4595 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 4596 } else { 4597 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 4598 4599 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 4600 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 4601 LocMemOffset); 4602 } 4603 4604 if (Outs[i].Flags.isByVal()) { 4605 SDValue SizeNode = 4606 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 4607 SDValue Cpy = DAG.getMemcpy( 4608 Chain, DL, DstAddr, Arg, SizeNode, 4609 Outs[i].Flags.getNonZeroByValAlign(), 4610 /*isVol = */ false, /*AlwaysInline = */ false, 4611 /*isTailCall = */ false, DstInfo, MachinePointerInfo()); 4612 4613 MemOpChains.push_back(Cpy); 4614 } else { 4615 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 4616 // promoted to a legal register type i32, we should truncate Arg back to 4617 // i1/i8/i16. 4618 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 4619 VA.getValVT() == MVT::i16) 4620 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 4621 4622 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 4623 MemOpChains.push_back(Store); 4624 } 4625 } 4626 } 4627 4628 if (!MemOpChains.empty()) 4629 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 4630 4631 // Build a sequence of copy-to-reg nodes chained together with token chain 4632 // and flag operands which copy the outgoing args into the appropriate regs. 
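// Illustrative note: the copies below are linked through a glue value
// (InFlag) in addition to the token chain, so the scheduler keeps the
// argument-register copies immediately before the call and nothing that could
// clobber those registers is scheduled in between.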
4633 SDValue InFlag;
4634 for (auto &RegToPass : RegsToPass) {
4635 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
4636 RegToPass.second, InFlag);
4637 InFlag = Chain.getValue(1);
4638 }
4639
4640 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
4641 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
4642 // node so that legalize doesn't hack it.
4643 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
4644 auto GV = G->getGlobal();
4645 unsigned OpFlags =
4646 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
4647 if (OpFlags & AArch64II::MO_GOT) {
4648 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
4649 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4650 } else {
4651 const GlobalValue *GV = G->getGlobal();
4652 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
4653 }
4654 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
4655 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
4656 Subtarget->isTargetMachO()) {
4657 const char *Sym = S->getSymbol();
4658 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
4659 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4660 } else {
4661 const char *Sym = S->getSymbol();
4662 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
4663 }
4664 }
4665
4666 // We don't usually want to end the call-sequence here because we would tidy
4667 // the frame up *after* the call, however in the ABI-changing tail-call case
4668 // we've carefully laid out the parameters so that when sp is reset they'll be
4669 // in the correct location.
4670 if (IsTailCall && !IsSibCall) {
4671 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
4672 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
4673 InFlag = Chain.getValue(1);
4674 }
4675
4676 std::vector<SDValue> Ops;
4677 Ops.push_back(Chain);
4678 Ops.push_back(Callee);
4679
4680 if (IsTailCall) {
4681 // Each tail call may have to adjust the stack by a different amount, so
4682 // this information must travel along with the operation for eventual
4683 // consumption by emitEpilogue.
4684 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4685 }
4686
4687 // Add argument registers to the end of the list so that they are known live
4688 // into the call.
4689 for (auto &RegToPass : RegsToPass)
4690 Ops.push_back(DAG.getRegister(RegToPass.first,
4691 RegToPass.second.getValueType()));
4692
4693 // Add a register mask operand representing the call-preserved registers.
4694 const uint32_t *Mask;
4695 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4696 if (IsThisReturn) {
4697 // For 'this' returns, use the X0-preserving mask if applicable
4698 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
4699 if (!Mask) {
4700 IsThisReturn = false;
4701 Mask = TRI->getCallPreservedMask(MF, CallConv);
4702 }
4703 } else
4704 Mask = TRI->getCallPreservedMask(MF, CallConv);
4705
4706 if (Subtarget->hasCustomCallingConv())
4707 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
4708
4709 if (TRI->isAnyArgRegReserved(MF))
4710 TRI->emitReservedArgRegCallError(MF);
4711
4712 assert(Mask && "Missing call preserved mask for calling convention");
4713 Ops.push_back(DAG.getRegisterMask(Mask));
4714
4715 if (InFlag.getNode())
4716 Ops.push_back(InFlag);
4717
4718 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4719
4720 // If we're doing a tail call, use a TC_RETURN here rather than an
4721 // actual call instruction.
4722 if (IsTailCall) { 4723 MF.getFrameInfo().setHasTailCall(); 4724 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); 4725 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 4726 return Ret; 4727 } 4728 4729 // Returns a chain and a flag for retval copy to use. 4730 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); 4731 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 4732 InFlag = Chain.getValue(1); 4733 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 4734 4735 uint64_t CalleePopBytes = 4736 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; 4737 4738 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 4739 DAG.getIntPtrConstant(CalleePopBytes, DL, true), 4740 InFlag, DL); 4741 if (!Ins.empty()) 4742 InFlag = Chain.getValue(1); 4743 4744 // Handle result values, copying them out of physregs into vregs that we 4745 // return. 4746 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 4747 InVals, IsThisReturn, 4748 IsThisReturn ? OutVals[0] : SDValue()); 4749 } 4750 4751 bool AArch64TargetLowering::CanLowerReturn( 4752 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 4753 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 4754 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 4755 ? RetCC_AArch64_WebKit_JS 4756 : RetCC_AArch64_AAPCS; 4757 SmallVector<CCValAssign, 16> RVLocs; 4758 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 4759 return CCInfo.CheckReturn(Outs, RetCC); 4760 } 4761 4762 SDValue 4763 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 4764 bool isVarArg, 4765 const SmallVectorImpl<ISD::OutputArg> &Outs, 4766 const SmallVectorImpl<SDValue> &OutVals, 4767 const SDLoc &DL, SelectionDAG &DAG) const { 4768 auto &MF = DAG.getMachineFunction(); 4769 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4770 4771 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 4772 ? RetCC_AArch64_WebKit_JS 4773 : RetCC_AArch64_AAPCS; 4774 SmallVector<CCValAssign, 16> RVLocs; 4775 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 4776 *DAG.getContext()); 4777 CCInfo.AnalyzeReturn(Outs, RetCC); 4778 4779 // Copy the result values into the output registers. 4780 SDValue Flag; 4781 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; 4782 SmallSet<unsigned, 4> RegsUsed; 4783 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 4784 ++i, ++realRVLocIdx) { 4785 CCValAssign &VA = RVLocs[i]; 4786 assert(VA.isRegLoc() && "Can only return in registers!"); 4787 SDValue Arg = OutVals[realRVLocIdx]; 4788 4789 switch (VA.getLocInfo()) { 4790 default: 4791 llvm_unreachable("Unknown loc info!"); 4792 case CCValAssign::Full: 4793 if (Outs[i].ArgVT == MVT::i1) { 4794 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 4795 // value. This is strictly redundant on Darwin (which uses "zeroext 4796 // i1"), but will be optimised out before ISel. 
4797 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 4798 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 4799 } 4800 break; 4801 case CCValAssign::BCvt: 4802 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 4803 break; 4804 case CCValAssign::AExt: 4805 case CCValAssign::ZExt: 4806 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 4807 break; 4808 case CCValAssign::AExtUpper: 4809 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 4810 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 4811 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 4812 DAG.getConstant(32, DL, VA.getLocVT())); 4813 break; 4814 } 4815 4816 if (RegsUsed.count(VA.getLocReg())) { 4817 SDValue &Bits = 4818 std::find_if(RetVals.begin(), RetVals.end(), 4819 [=](const std::pair<unsigned, SDValue> &Elt) { 4820 return Elt.first == VA.getLocReg(); 4821 }) 4822 ->second; 4823 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 4824 } else { 4825 RetVals.emplace_back(VA.getLocReg(), Arg); 4826 RegsUsed.insert(VA.getLocReg()); 4827 } 4828 } 4829 4830 SmallVector<SDValue, 4> RetOps(1, Chain); 4831 for (auto &RetVal : RetVals) { 4832 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); 4833 Flag = Chain.getValue(1); 4834 RetOps.push_back( 4835 DAG.getRegister(RetVal.first, RetVal.second.getValueType())); 4836 } 4837 4838 // Windows AArch64 ABIs require that for returning structs by value we copy 4839 // the sret argument into X0 for the return. 4840 // We saved the argument into a virtual register in the entry block, 4841 // so now we copy the value out and into X0. 4842 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 4843 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, 4844 getPointerTy(MF.getDataLayout())); 4845 4846 unsigned RetValReg = AArch64::X0; 4847 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); 4848 Flag = Chain.getValue(1); 4849 4850 RetOps.push_back( 4851 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 4852 } 4853 4854 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 4855 const MCPhysReg *I = 4856 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 4857 if (I) { 4858 for (; *I; ++I) { 4859 if (AArch64::GPR64RegClass.contains(*I)) 4860 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 4861 else if (AArch64::FPR64RegClass.contains(*I)) 4862 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 4863 else 4864 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 4865 } 4866 } 4867 4868 RetOps[0] = Chain; // Update chain. 4869 4870 // Add the flag if we have it. 
4871 if (Flag.getNode()) 4872 RetOps.push_back(Flag); 4873 4874 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); 4875 } 4876 4877 //===----------------------------------------------------------------------===// 4878 // Other Lowering Code 4879 //===----------------------------------------------------------------------===// 4880 4881 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, 4882 SelectionDAG &DAG, 4883 unsigned Flag) const { 4884 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 4885 N->getOffset(), Flag); 4886 } 4887 4888 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, 4889 SelectionDAG &DAG, 4890 unsigned Flag) const { 4891 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); 4892 } 4893 4894 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, 4895 SelectionDAG &DAG, 4896 unsigned Flag) const { 4897 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), 4898 N->getOffset(), Flag); 4899 } 4900 4901 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, 4902 SelectionDAG &DAG, 4903 unsigned Flag) const { 4904 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); 4905 } 4906 4907 // (loadGOT sym) 4908 template <class NodeTy> 4909 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, 4910 unsigned Flags) const { 4911 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); 4912 SDLoc DL(N); 4913 EVT Ty = getPointerTy(DAG.getDataLayout()); 4914 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); 4915 // FIXME: Once remat is capable of dealing with instructions with register 4916 // operands, expand this into two nodes instead of using a wrapper node. 4917 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); 4918 } 4919 4920 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) 4921 template <class NodeTy> 4922 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, 4923 unsigned Flags) const { 4924 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); 4925 SDLoc DL(N); 4926 EVT Ty = getPointerTy(DAG.getDataLayout()); 4927 const unsigned char MO_NC = AArch64II::MO_NC; 4928 return DAG.getNode( 4929 AArch64ISD::WrapperLarge, DL, Ty, 4930 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), 4931 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), 4932 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), 4933 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); 4934 } 4935 4936 // (addlow (adrp %hi(sym)) %lo(sym)) 4937 template <class NodeTy> 4938 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, 4939 unsigned Flags) const { 4940 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); 4941 SDLoc DL(N); 4942 EVT Ty = getPointerTy(DAG.getDataLayout()); 4943 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); 4944 SDValue Lo = getTargetNode(N, Ty, DAG, 4945 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); 4946 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); 4947 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); 4948 } 4949 4950 // (adr sym) 4951 template <class NodeTy> 4952 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, 4953 unsigned Flags) const { 4954 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); 4955 SDLoc DL(N); 4956 EVT Ty = getPointerTy(DAG.getDataLayout()); 4957 SDValue Sym = getTargetNode(N, Ty, DAG, Flags); 4958 return 
DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); 4959 } 4960 4961 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, 4962 SelectionDAG &DAG) const { 4963 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 4964 const GlobalValue *GV = GN->getGlobal(); 4965 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4966 4967 if (OpFlags != AArch64II::MO_NO_FLAG) 4968 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && 4969 "unexpected offset in global node"); 4970 4971 // This also catches the large code model case for Darwin, and tiny code 4972 // model with got relocations. 4973 if ((OpFlags & AArch64II::MO_GOT) != 0) { 4974 return getGOT(GN, DAG, OpFlags); 4975 } 4976 4977 SDValue Result; 4978 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4979 Result = getAddrLarge(GN, DAG, OpFlags); 4980 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 4981 Result = getAddrTiny(GN, DAG, OpFlags); 4982 } else { 4983 Result = getAddr(GN, DAG, OpFlags); 4984 } 4985 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4986 SDLoc DL(GN); 4987 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) 4988 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 4989 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 4990 return Result; 4991 } 4992 4993 /// Convert a TLS address reference into the correct sequence of loads 4994 /// and calls to compute the variable's address (for Darwin, currently) and 4995 /// return an SDValue containing the final node. 4996 4997 /// Darwin only has one TLS scheme which must be capable of dealing with the 4998 /// fully general situation, in the worst case. This means: 4999 /// + "extern __thread" declaration. 5000 /// + Defined in a possibly unknown dynamic library. 5001 /// 5002 /// The general system is that each __thread variable has a [3 x i64] descriptor 5003 /// which contains information used by the runtime to calculate the address. The 5004 /// only part of this the compiler needs to know about is the first xword, which 5005 /// contains a function pointer that must be called with the address of the 5006 /// entire descriptor in "x0". 5007 /// 5008 /// Since this descriptor may be in a different unit, in general even the 5009 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 5010 /// is: 5011 /// adrp x0, _var@TLVPPAGE 5012 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 5013 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 5014 /// ; the function pointer 5015 /// blr x1 ; Uses descriptor address in x0 5016 /// ; Address of _var is now in x0. 5017 /// 5018 /// If the address of _var's descriptor *is* known to the linker, then it can 5019 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 5020 /// a slight efficiency gain. 
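///
/// For illustration: a simple access such as
///   @var = thread_local global i32 0
///   %v = load i32, i32* @var
/// reaches this hook as a GlobalTLSAddress node, which the code below expands
/// into the descriptor call sequence described above.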
5021 SDValue 5022 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 5023 SelectionDAG &DAG) const { 5024 assert(Subtarget->isTargetDarwin() && 5025 "This function expects a Darwin target"); 5026 5027 SDLoc DL(Op); 5028 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 5029 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 5030 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5031 5032 SDValue TLVPAddr = 5033 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 5034 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 5035 5036 // The first entry in the descriptor is a function pointer that we must call 5037 // to obtain the address of the variable. 5038 SDValue Chain = DAG.getEntryNode(); 5039 SDValue FuncTLVGet = DAG.getLoad( 5040 PtrMemVT, DL, Chain, DescAddr, 5041 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 5042 /* Alignment = */ PtrMemVT.getSizeInBits() / 8, 5043 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); 5044 Chain = FuncTLVGet.getValue(1); 5045 5046 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. 5047 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); 5048 5049 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 5050 MFI.setAdjustsStack(true); 5051 5052 // TLS calls preserve all registers except those that absolutely must be 5053 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 5054 // silly). 5055 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 5056 const uint32_t *Mask = TRI->getTLSCallPreservedMask(); 5057 if (Subtarget->hasCustomCallingConv()) 5058 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 5059 5060 // Finally, we can make the call. This is just a degenerate version of a 5061 // normal AArch64 call node: x0 takes the address of the descriptor, and 5062 // returns the address of the variable in this thread. 5063 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 5064 Chain = 5065 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 5066 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 5067 DAG.getRegisterMask(Mask), Chain.getValue(1)); 5068 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 5069 } 5070 5071 /// Convert a thread-local variable reference into a sequence of instructions to 5072 /// compute the variable's address for the local exec TLS model of ELF targets. 5073 /// The sequence depends on the maximum TLS area size. 
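///
/// Roughly: TLSSize == 12 needs a single tprel_lo12 ADD off TPIDR_EL0,
/// 24 needs a tprel_hi12 / tprel_lo12_nc ADD pair, and 32 / 48 build the
/// offset with MOVZ/MOVK of tprel_g* relocations before the final ADD, as
/// the cases below spell out.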
5074 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, 5075 SDValue ThreadBase, 5076 const SDLoc &DL, 5077 SelectionDAG &DAG) const { 5078 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5079 SDValue TPOff, Addr; 5080 5081 switch (DAG.getTarget().Options.TLSSize) { 5082 default: 5083 llvm_unreachable("Unexpected TLS size"); 5084 5085 case 12: { 5086 // mrs x0, TPIDR_EL0 5087 // add x0, x0, :tprel_lo12:a 5088 SDValue Var = DAG.getTargetGlobalAddress( 5089 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); 5090 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 5091 Var, 5092 DAG.getTargetConstant(0, DL, MVT::i32)), 5093 0); 5094 } 5095 5096 case 24: { 5097 // mrs x0, TPIDR_EL0 5098 // add x0, x0, :tprel_hi12:a 5099 // add x0, x0, :tprel_lo12_nc:a 5100 SDValue HiVar = DAG.getTargetGlobalAddress( 5101 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 5102 SDValue LoVar = DAG.getTargetGlobalAddress( 5103 GV, DL, PtrVT, 0, 5104 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 5105 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 5106 HiVar, 5107 DAG.getTargetConstant(0, DL, MVT::i32)), 5108 0); 5109 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, 5110 LoVar, 5111 DAG.getTargetConstant(0, DL, MVT::i32)), 5112 0); 5113 } 5114 5115 case 32: { 5116 // mrs x1, TPIDR_EL0 5117 // movz x0, #:tprel_g1:a 5118 // movk x0, #:tprel_g0_nc:a 5119 // add x0, x1, x0 5120 SDValue HiVar = DAG.getTargetGlobalAddress( 5121 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); 5122 SDValue LoVar = DAG.getTargetGlobalAddress( 5123 GV, DL, PtrVT, 0, 5124 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 5125 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 5126 DAG.getTargetConstant(16, DL, MVT::i32)), 5127 0); 5128 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 5129 DAG.getTargetConstant(0, DL, MVT::i32)), 5130 0); 5131 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 5132 } 5133 5134 case 48: { 5135 // mrs x1, TPIDR_EL0 5136 // movz x0, #:tprel_g2:a 5137 // movk x0, #:tprel_g1_nc:a 5138 // movk x0, #:tprel_g0_nc:a 5139 // add x0, x1, x0 5140 SDValue HiVar = DAG.getTargetGlobalAddress( 5141 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); 5142 SDValue MiVar = DAG.getTargetGlobalAddress( 5143 GV, DL, PtrVT, 0, 5144 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); 5145 SDValue LoVar = DAG.getTargetGlobalAddress( 5146 GV, DL, PtrVT, 0, 5147 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); 5148 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, 5149 DAG.getTargetConstant(32, DL, MVT::i32)), 5150 0); 5151 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, 5152 DAG.getTargetConstant(16, DL, MVT::i32)), 5153 0); 5154 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 5155 DAG.getTargetConstant(0, DL, MVT::i32)), 5156 0); 5157 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 5158 } 5159 } 5160 } 5161 5162 /// When accessing thread-local variables under either the general-dynamic or 5163 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 5164 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 5165 /// is a function pointer to carry out the resolution. 
5166 /// 5167 /// The sequence is: 5168 /// adrp x0, :tlsdesc:var 5169 /// ldr x1, [x0, #:tlsdesc_lo12:var] 5170 /// add x0, x0, #:tlsdesc_lo12:var 5171 /// .tlsdesccall var 5172 /// blr x1 5173 /// (TPIDR_EL0 offset now in x0) 5174 /// 5175 /// The above sequence must be produced unscheduled, to enable the linker to 5176 /// optimize/relax this sequence. 5177 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 5178 /// above sequence, and expanded really late in the compilation flow, to ensure 5179 /// the sequence is produced as per above. 5180 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 5181 const SDLoc &DL, 5182 SelectionDAG &DAG) const { 5183 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5184 5185 SDValue Chain = DAG.getEntryNode(); 5186 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 5187 5188 Chain = 5189 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 5190 SDValue Glue = Chain.getValue(1); 5191 5192 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 5193 } 5194 5195 SDValue 5196 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 5197 SelectionDAG &DAG) const { 5198 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 5199 5200 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5201 5202 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 5203 5204 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 5205 if (Model == TLSModel::LocalDynamic) 5206 Model = TLSModel::GeneralDynamic; 5207 } 5208 5209 if (getTargetMachine().getCodeModel() == CodeModel::Large && 5210 Model != TLSModel::LocalExec) 5211 report_fatal_error("ELF TLS only supported in small memory model or " 5212 "in local exec TLS model"); 5213 // Different choices can be made for the maximum size of the TLS area for a 5214 // module. For the small address model, the default TLS size is 16MiB and the 5215 // maximum TLS size is 4GiB. 5216 // FIXME: add tiny and large code model support for TLS access models other 5217 // than local exec. We currently generate the same code as small for tiny, 5218 // which may be larger than needed. 5219 5220 SDValue TPOff; 5221 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5222 SDLoc DL(Op); 5223 const GlobalValue *GV = GA->getGlobal(); 5224 5225 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 5226 5227 if (Model == TLSModel::LocalExec) { 5228 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); 5229 } else if (Model == TLSModel::InitialExec) { 5230 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 5231 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 5232 } else if (Model == TLSModel::LocalDynamic) { 5233 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 5234 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 5235 // the beginning of the module's TLS region, followed by a DTPREL offset 5236 // calculation. 5237 5238 // These accesses will need deduplicating if there's more than one. 5239 AArch64FunctionInfo *MFI = 5240 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 5241 MFI->incNumLocalDynamicTLSAccesses(); 5242 5243 // The call needs a relocation too for linker relaxation. It doesn't make 5244 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 5245 // the address. 
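// Illustrative summary of what the local-dynamic path below computes:
//   addr = TPIDR_EL0 + tlsdesc(_TLS_MODULE_BASE_)
//                    + dtprel_hi12(var) + dtprel_lo12_nc(var)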
5246 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 5247 AArch64II::MO_TLS); 5248 5249 // Now we can calculate the offset from TPIDR_EL0 to this module's 5250 // thread-local area. 5251 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 5252 5253 // Now use :dtprel_whatever: operations to calculate this variable's offset 5254 // in its thread-storage area. 5255 SDValue HiVar = DAG.getTargetGlobalAddress( 5256 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 5257 SDValue LoVar = DAG.getTargetGlobalAddress( 5258 GV, DL, MVT::i64, 0, 5259 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 5260 5261 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 5262 DAG.getTargetConstant(0, DL, MVT::i32)), 5263 0); 5264 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 5265 DAG.getTargetConstant(0, DL, MVT::i32)), 5266 0); 5267 } else if (Model == TLSModel::GeneralDynamic) { 5268 // The call needs a relocation too for linker relaxation. It doesn't make 5269 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 5270 // the address. 5271 SDValue SymAddr = 5272 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 5273 5274 // Finally we can make a call to calculate the offset from tpidr_el0. 5275 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 5276 } else 5277 llvm_unreachable("Unsupported ELF TLS access model"); 5278 5279 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 5280 } 5281 5282 SDValue 5283 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, 5284 SelectionDAG &DAG) const { 5285 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 5286 5287 SDValue Chain = DAG.getEntryNode(); 5288 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5289 SDLoc DL(Op); 5290 5291 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); 5292 5293 // Load the ThreadLocalStoragePointer from the TEB 5294 // A pointer to the TLS array is located at offset 0x58 from the TEB. 5295 SDValue TLSArray = 5296 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); 5297 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 5298 Chain = TLSArray.getValue(1); 5299 5300 // Load the TLS index from the C runtime; 5301 // This does the same as getAddr(), but without having a GlobalAddressSDNode. 5302 // This also does the same as LOADgot, but using a generic i32 load, 5303 // while LOADgot only loads i64. 5304 SDValue TLSIndexHi = 5305 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); 5306 SDValue TLSIndexLo = DAG.getTargetExternalSymbol( 5307 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 5308 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); 5309 SDValue TLSIndex = 5310 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); 5311 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); 5312 Chain = TLSIndex.getValue(1); 5313 5314 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 5315 // offset into the TLSArray. 
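// Roughly, in pseudocode (TEB is in x18, offsets as noted above):
//   TlsArray = *(void ***)(TEB + 0x58);
//   TlsBase  = TlsArray[_tls_index];
//   Addr     = (char *)TlsBase + <var's offset within the .tls section>;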
5316 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); 5317 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 5318 DAG.getConstant(3, DL, PtrVT)); 5319 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 5320 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 5321 MachinePointerInfo()); 5322 Chain = TLS.getValue(1); 5323 5324 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5325 const GlobalValue *GV = GA->getGlobal(); 5326 SDValue TGAHi = DAG.getTargetGlobalAddress( 5327 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 5328 SDValue TGALo = DAG.getTargetGlobalAddress( 5329 GV, DL, PtrVT, 0, 5330 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 5331 5332 // Add the offset from the start of the .tls section (section base). 5333 SDValue Addr = 5334 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, 5335 DAG.getTargetConstant(0, DL, MVT::i32)), 5336 0); 5337 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); 5338 return Addr; 5339 } 5340 5341 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 5342 SelectionDAG &DAG) const { 5343 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5344 if (DAG.getTarget().useEmulatedTLS()) 5345 return LowerToTLSEmulatedModel(GA, DAG); 5346 5347 if (Subtarget->isTargetDarwin()) 5348 return LowerDarwinGlobalTLSAddress(Op, DAG); 5349 if (Subtarget->isTargetELF()) 5350 return LowerELFGlobalTLSAddress(Op, DAG); 5351 if (Subtarget->isTargetWindows()) 5352 return LowerWindowsGlobalTLSAddress(Op, DAG); 5353 5354 llvm_unreachable("Unexpected platform trying to use TLS"); 5355 } 5356 5357 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5358 SDValue Chain = Op.getOperand(0); 5359 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5360 SDValue LHS = Op.getOperand(2); 5361 SDValue RHS = Op.getOperand(3); 5362 SDValue Dest = Op.getOperand(4); 5363 SDLoc dl(Op); 5364 5365 MachineFunction &MF = DAG.getMachineFunction(); 5366 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 5367 // will not be produced, as they are conditional branch instructions that do 5368 // not set flags. 5369 bool ProduceNonFlagSettingCondBr = 5370 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 5371 5372 // Handle f128 first, since lowering it will result in comparing the return 5373 // value of a libcall against zero, which is just what the rest of LowerBR_CC 5374 // is expecting to deal with. 5375 if (LHS.getValueType() == MVT::f128) { 5376 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); 5377 5378 // If softenSetCCOperands returned a scalar, we need to compare the result 5379 // against zero to select between true and false values. 5380 if (!RHS.getNode()) { 5381 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5382 CC = ISD::SETNE; 5383 } 5384 } 5385 5386 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5387 // instruction. 5388 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && 5389 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5390 // Only lower legal XALUO ops. 5391 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5392 return SDValue(); 5393 5394 // The actual operation with overflow check. 
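// For example, a branch on the overflow bit of llvm.sadd.with.overflow.i32
// becomes ADDS followed by B.VS (or B.VC when the branch tests for the bit
// being clear), instead of materializing the bit with CSET and re-comparing.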
5395 AArch64CC::CondCode OFCC; 5396 SDValue Value, Overflow; 5397 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 5398 5399 if (CC == ISD::SETNE) 5400 OFCC = getInvertedCondCode(OFCC); 5401 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 5402 5403 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 5404 Overflow); 5405 } 5406 5407 if (LHS.getValueType().isInteger()) { 5408 assert((LHS.getValueType() == RHS.getValueType()) && 5409 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 5410 5411 // If the RHS of the comparison is zero, we can potentially fold this 5412 // to a specialized branch. 5413 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 5414 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { 5415 if (CC == ISD::SETEQ) { 5416 // See if we can use a TBZ to fold in an AND as well. 5417 // TBZ has a smaller branch displacement than CBZ. If the offset is 5418 // out of bounds, a late MI-layer pass rewrites branches. 5419 // 403.gcc is an example that hits this case. 5420 if (LHS.getOpcode() == ISD::AND && 5421 isa<ConstantSDNode>(LHS.getOperand(1)) && 5422 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 5423 SDValue Test = LHS.getOperand(0); 5424 uint64_t Mask = LHS.getConstantOperandVal(1); 5425 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 5426 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 5427 Dest); 5428 } 5429 5430 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 5431 } else if (CC == ISD::SETNE) { 5432 // See if we can use a TBZ to fold in an AND as well. 5433 // TBZ has a smaller branch displacement than CBZ. If the offset is 5434 // out of bounds, a late MI-layer pass rewrites branches. 5435 // 403.gcc is an example that hits this case. 5436 if (LHS.getOpcode() == ISD::AND && 5437 isa<ConstantSDNode>(LHS.getOperand(1)) && 5438 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 5439 SDValue Test = LHS.getOperand(0); 5440 uint64_t Mask = LHS.getConstantOperandVal(1); 5441 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 5442 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 5443 Dest); 5444 } 5445 5446 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 5447 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 5448 // Don't combine AND since emitComparison converts the AND to an ANDS 5449 // (a.k.a. TST) and the test in the test bit and branch instruction 5450 // becomes redundant. This would also increase register pressure. 5451 uint64_t Mask = LHS.getValueSizeInBits() - 1; 5452 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 5453 DAG.getConstant(Mask, dl, MVT::i64), Dest); 5454 } 5455 } 5456 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 5457 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { 5458 // Don't combine AND since emitComparison converts the AND to an ANDS 5459 // (a.k.a. TST) and the test in the test bit and branch instruction 5460 // becomes redundant. This would also increase register pressure. 
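// For example, with a 64-bit LHS the mask below selects bit 63 (the sign
// bit): "x > -1" becomes TBZ x, #63 here, and the SETLT-against-zero case
// above becomes TBNZ x, #63.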
5461 uint64_t Mask = LHS.getValueSizeInBits() - 1; 5462 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 5463 DAG.getConstant(Mask, dl, MVT::i64), Dest); 5464 } 5465 5466 SDValue CCVal; 5467 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 5468 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 5469 Cmp); 5470 } 5471 5472 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || 5473 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 5474 5475 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 5476 // clean. Some of them require two branches to implement. 5477 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 5478 AArch64CC::CondCode CC1, CC2; 5479 changeFPCCToAArch64CC(CC, CC1, CC2); 5480 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 5481 SDValue BR1 = 5482 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 5483 if (CC2 != AArch64CC::AL) { 5484 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 5485 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 5486 Cmp); 5487 } 5488 5489 return BR1; 5490 } 5491 5492 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 5493 SelectionDAG &DAG) const { 5494 EVT VT = Op.getValueType(); 5495 SDLoc DL(Op); 5496 5497 SDValue In1 = Op.getOperand(0); 5498 SDValue In2 = Op.getOperand(1); 5499 EVT SrcVT = In2.getValueType(); 5500 5501 if (SrcVT.bitsLT(VT)) 5502 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 5503 else if (SrcVT.bitsGT(VT)) 5504 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 5505 5506 EVT VecVT; 5507 uint64_t EltMask; 5508 SDValue VecVal1, VecVal2; 5509 5510 auto setVecVal = [&] (int Idx) { 5511 if (!VT.isVector()) { 5512 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 5513 DAG.getUNDEF(VecVT), In1); 5514 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 5515 DAG.getUNDEF(VecVT), In2); 5516 } else { 5517 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 5518 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 5519 } 5520 }; 5521 5522 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 5523 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 5524 EltMask = 0x80000000ULL; 5525 setVecVal(AArch64::ssub); 5526 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 5527 VecVT = MVT::v2i64; 5528 5529 // We want to materialize a mask with the high bit set, but the AdvSIMD 5530 // immediate moves cannot materialize that in a single instruction for 5531 // 64-bit elements. Instead, materialize zero and then negate it. 5532 EltMask = 0; 5533 5534 setVecVal(AArch64::dsub); 5535 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { 5536 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); 5537 EltMask = 0x8000ULL; 5538 setVecVal(AArch64::hsub); 5539 } else { 5540 llvm_unreachable("Invalid type for copysign!"); 5541 } 5542 5543 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 5544 5545 // If we couldn't materialize the mask above, then the mask vector will be 5546 // the zero vector, and we need to negate it here. 
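// (FNEG of +0.0 is -0.0, whose only set bit is the sign bit, i.e. exactly
// the 0x8000000000000000 mask needed for 64-bit elements.)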
5547 if (VT == MVT::f64 || VT == MVT::v2f64) { 5548 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 5549 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 5550 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 5551 } 5552 5553 SDValue Sel = 5554 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 5555 5556 if (VT == MVT::f16) 5557 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); 5558 if (VT == MVT::f32) 5559 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 5560 else if (VT == MVT::f64) 5561 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 5562 else 5563 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 5564 } 5565 5566 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 5567 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 5568 Attribute::NoImplicitFloat)) 5569 return SDValue(); 5570 5571 if (!Subtarget->hasNEON()) 5572 return SDValue(); 5573 5574 // While there is no integer popcount instruction, it can 5575 // be more efficiently lowered to the following sequence that uses 5576 // AdvSIMD registers/instructions as long as the copies to/from 5577 // the AdvSIMD registers are cheap. 5578 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 5579 // CNT V0.8B, V0.8B // 8xbyte pop-counts 5580 // ADDV B0, V0.8B // sum 8xbyte pop-counts 5581 // UMOV X0, V0.B[0] // copy byte result back to integer reg 5582 SDValue Val = Op.getOperand(0); 5583 SDLoc DL(Op); 5584 EVT VT = Op.getValueType(); 5585 5586 if (VT == MVT::i32 || VT == MVT::i64) { 5587 if (VT == MVT::i32) 5588 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 5589 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 5590 5591 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 5592 SDValue UaddLV = DAG.getNode( 5593 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 5594 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 5595 5596 if (VT == MVT::i64) 5597 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 5598 return UaddLV; 5599 } else if (VT == MVT::i128) { 5600 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); 5601 5602 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); 5603 SDValue UaddLV = DAG.getNode( 5604 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 5605 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 5606 5607 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); 5608 } 5609 5610 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 5611 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 5612 "Unexpected type for custom ctpop lowering"); 5613 5614 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 5615 Val = DAG.getBitcast(VT8Bit, Val); 5616 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); 5617 5618 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 5619 unsigned EltSize = 8; 5620 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 5621 while (EltSize != VT.getScalarSizeInBits()) { 5622 EltSize *= 2; 5623 NumElts /= 2; 5624 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 5625 Val = DAG.getNode( 5626 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, 5627 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); 5628 } 5629 5630 return Val; 5631 } 5632 5633 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 5634 5635 if (Op.getValueType().isVector()) 5636 return LowerVSETCC(Op, DAG); 5637 5638 bool IsStrict = Op->isStrictFPOpcode(); 5639 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 5640 unsigned OpNo = IsStrict ? 1 : 0; 5641 SDValue Chain; 5642 if (IsStrict) 5643 Chain = Op.getOperand(0); 5644 SDValue LHS = Op.getOperand(OpNo + 0); 5645 SDValue RHS = Op.getOperand(OpNo + 1); 5646 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); 5647 SDLoc dl(Op); 5648 5649 // We chose ZeroOrOneBooleanContents, so use zero and one. 5650 EVT VT = Op.getValueType(); 5651 SDValue TVal = DAG.getConstant(1, dl, VT); 5652 SDValue FVal = DAG.getConstant(0, dl, VT); 5653 5654 // Handle f128 first, since one possible outcome is a normal integer 5655 // comparison which gets picked up by the next if statement. 5656 if (LHS.getValueType() == MVT::f128) { 5657 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, 5658 IsSignaling); 5659 5660 // If softenSetCCOperands returned a scalar, use it. 5661 if (!RHS.getNode()) { 5662 assert(LHS.getValueType() == Op.getValueType() && 5663 "Unexpected setcc expansion!"); 5664 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS; 5665 } 5666 } 5667 5668 if (LHS.getValueType().isInteger()) { 5669 SDValue CCVal; 5670 SDValue Cmp = getAArch64Cmp( 5671 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); 5672 5673 // Note that we inverted the condition above, so we reverse the order of 5674 // the true and false operands here. This will allow the setcc to be 5675 // matched to a single CSINC instruction. 5676 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 5677 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; 5678 } 5679 5680 // Now we know we're dealing with FP values. 5681 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 5682 LHS.getValueType() == MVT::f64); 5683 5684 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead 5685 // and do the comparison. 5686 SDValue Cmp; 5687 if (IsStrict) 5688 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); 5689 else 5690 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 5691 5692 AArch64CC::CondCode CC1, CC2; 5693 changeFPCCToAArch64CC(CC, CC1, CC2); 5694 SDValue Res; 5695 if (CC2 == AArch64CC::AL) { 5696 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, 5697 CC2); 5698 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 5699 5700 // Note that we inverted the condition above, so we reverse the order of 5701 // the true and false operands here. This will allow the setcc to be 5702 // matched to a single CSINC instruction. 5703 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); 5704 } else { 5705 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 5706 // totally clean. Some of them require two CSELs to implement. As is in 5707 // this case, we emit the first CSEL and then emit a second using the output 5708 // of the first as the RHS. 
We're effectively OR'ing the two CC's together.
5709
5710 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
5711 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
5712 SDValue CS1 =
5713 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
5714
5715 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
5716 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
5717 }
5718 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
5719 }
5720
5721 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
5722 SDValue RHS, SDValue TVal,
5723 SDValue FVal, const SDLoc &dl,
5724 SelectionDAG &DAG) const {
5725 // Handle f128 first, because it will result in a comparison of some RTLIB
5726 // call result against zero.
5727 if (LHS.getValueType() == MVT::f128) {
5728 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
5729
5730 // If softenSetCCOperands returned a scalar, we need to compare the result
5731 // against zero to select between true and false values.
5732 if (!RHS.getNode()) {
5733 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5734 CC = ISD::SETNE;
5735 }
5736 }
5737
5738 // Also handle f16, for which we need to do a f32 comparison.
5739 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5740 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
5741 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
5742 }
5743
5744 // Next, handle integers.
5745 if (LHS.getValueType().isInteger()) {
5746 assert((LHS.getValueType() == RHS.getValueType()) &&
5747 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
5748
5749 unsigned Opcode = AArch64ISD::CSEL;
5750
5751 // If both the TVal and the FVal are constants, see if we can swap them in
5752 // order to form a CSINV or CSINC out of them.
5753 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
5754 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
5755
5756 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
5757 std::swap(TVal, FVal);
5758 std::swap(CTVal, CFVal);
5759 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5760 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
5761 std::swap(TVal, FVal);
5762 std::swap(CTVal, CFVal);
5763 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5764 } else if (TVal.getOpcode() == ISD::XOR) {
5765 // If TVal is a NOT we want to swap TVal and FVal so that we can match
5766 // with a CSINV rather than a CSEL.
5767 if (isAllOnesConstant(TVal.getOperand(1))) {
5768 std::swap(TVal, FVal);
5769 std::swap(CTVal, CFVal);
5770 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5771 }
5772 } else if (TVal.getOpcode() == ISD::SUB) {
5773 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
5774 // that we can match with a CSNEG rather than a CSEL.
5775 if (isNullConstant(TVal.getOperand(0))) {
5776 std::swap(TVal, FVal);
5777 std::swap(CTVal, CFVal);
5778 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5779 }
5780 } else if (CTVal && CFVal) {
5781 const int64_t TrueVal = CTVal->getSExtValue();
5782 const int64_t FalseVal = CFVal->getSExtValue();
5783 bool Swap = false;
5784
5785 // If both TVal and FVal are constants, see if FVal is the
5786 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
5787 // instead of a CSEL in that case.
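// For example: "cond ? 5 : -6" (bitwise complements) can use CSINV,
// "cond ? 1 : -1" can use CSNEG, and "cond ? 7 : 6" can use CSINC; only one
// of the two constants then needs to be materialized in a register.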
5788 if (TrueVal == ~FalseVal) { 5789 Opcode = AArch64ISD::CSINV; 5790 } else if (TrueVal == -FalseVal) { 5791 Opcode = AArch64ISD::CSNEG; 5792 } else if (TVal.getValueType() == MVT::i32) { 5793 // If our operands are only 32-bit wide, make sure we use 32-bit 5794 // arithmetic for the check whether we can use CSINC. This ensures that 5795 // the addition in the check will wrap around properly in case there is 5796 // an overflow (which would not be the case if we do the check with 5797 // 64-bit arithmetic). 5798 const uint32_t TrueVal32 = CTVal->getZExtValue(); 5799 const uint32_t FalseVal32 = CFVal->getZExtValue(); 5800 5801 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 5802 Opcode = AArch64ISD::CSINC; 5803 5804 if (TrueVal32 > FalseVal32) { 5805 Swap = true; 5806 } 5807 } 5808 // 64-bit check whether we can use CSINC. 5809 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 5810 Opcode = AArch64ISD::CSINC; 5811 5812 if (TrueVal > FalseVal) { 5813 Swap = true; 5814 } 5815 } 5816 5817 // Swap TVal and FVal if necessary. 5818 if (Swap) { 5819 std::swap(TVal, FVal); 5820 std::swap(CTVal, CFVal); 5821 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5822 } 5823 5824 if (Opcode != AArch64ISD::CSEL) { 5825 // Drop FVal since we can get its value by simply inverting/negating 5826 // TVal. 5827 FVal = TVal; 5828 } 5829 } 5830 5831 // Avoid materializing a constant when possible by reusing a known value in 5832 // a register. However, don't perform this optimization if the known value 5833 // is one, zero or negative one in the case of a CSEL. We can always 5834 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the 5835 // FVal, respectively. 5836 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); 5837 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && 5838 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { 5839 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 5840 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to 5841 // "a != C ? x : a" to avoid materializing C. 5842 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) 5843 TVal = LHS; 5844 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) 5845 FVal = LHS; 5846 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { 5847 assert (CTVal && CFVal && "Expected constant operands for CSNEG."); 5848 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to 5849 // avoid materializing C. 5850 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 5851 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { 5852 Opcode = AArch64ISD::CSINV; 5853 TVal = LHS; 5854 FVal = DAG.getConstant(0, dl, FVal.getValueType()); 5855 } 5856 } 5857 5858 SDValue CCVal; 5859 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 5860 EVT VT = TVal.getValueType(); 5861 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 5862 } 5863 5864 // Now we know we're dealing with FP values. 5865 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 5866 LHS.getValueType() == MVT::f64); 5867 assert(LHS.getValueType() == RHS.getValueType()); 5868 EVT VT = TVal.getValueType(); 5869 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 5870 5871 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 5872 // clean. Some of them require two CSELs to implement. 
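// For example, SETONE (ordered and not equal) has no single AArch64
// condition code; it is split into MI and GT, and the second CSEL emitted
// below effectively ORs the two conditions together.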
5873 AArch64CC::CondCode CC1, CC2; 5874 changeFPCCToAArch64CC(CC, CC1, CC2); 5875 5876 if (DAG.getTarget().Options.UnsafeFPMath) { 5877 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and 5878 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. 5879 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); 5880 if (RHSVal && RHSVal->isZero()) { 5881 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); 5882 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); 5883 5884 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && 5885 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) 5886 TVal = LHS; 5887 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && 5888 CFVal && CFVal->isZero() && 5889 FVal.getValueType() == LHS.getValueType()) 5890 FVal = LHS; 5891 } 5892 } 5893 5894 // Emit first, and possibly only, CSEL. 5895 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 5896 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 5897 5898 // If we need a second CSEL, emit it, using the output of the first as the 5899 // RHS. We're effectively OR'ing the two CC's together. 5900 if (CC2 != AArch64CC::AL) { 5901 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 5902 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 5903 } 5904 5905 // Otherwise, return the output of the first CSEL. 5906 return CS1; 5907 } 5908 5909 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 5910 SelectionDAG &DAG) const { 5911 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5912 SDValue LHS = Op.getOperand(0); 5913 SDValue RHS = Op.getOperand(1); 5914 SDValue TVal = Op.getOperand(2); 5915 SDValue FVal = Op.getOperand(3); 5916 SDLoc DL(Op); 5917 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 5918 } 5919 5920 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 5921 SelectionDAG &DAG) const { 5922 SDValue CCVal = Op->getOperand(0); 5923 SDValue TVal = Op->getOperand(1); 5924 SDValue FVal = Op->getOperand(2); 5925 SDLoc DL(Op); 5926 5927 EVT Ty = Op.getValueType(); 5928 if (Ty.isScalableVector()) { 5929 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); 5930 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); 5931 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); 5932 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); 5933 } 5934 5935 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 5936 // instruction. 5937 if (ISD::isOverflowIntrOpRes(CCVal)) { 5938 // Only lower legal XALUO ops. 5939 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 5940 return SDValue(); 5941 5942 AArch64CC::CondCode OFCC; 5943 SDValue Value, Overflow; 5944 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 5945 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 5946 5947 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 5948 CCVal, Overflow); 5949 } 5950 5951 // Lower it the same way as we would lower a SELECT_CC node. 
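// Sketch of the two shapes handled below (illustrative IR, not from a test):
//   %c = setcc slt %a, %b ; select %c, %x, %y  -> reuse (%a, %b, SETLT)
//   select i1 %c, %x, %y (opaque condition)    -> compare %c against 0, SETNE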
5952 ISD::CondCode CC; 5953 SDValue LHS, RHS; 5954 if (CCVal.getOpcode() == ISD::SETCC) { 5955 LHS = CCVal.getOperand(0); 5956 RHS = CCVal.getOperand(1); 5957 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 5958 } else { 5959 LHS = CCVal; 5960 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 5961 CC = ISD::SETNE; 5962 } 5963 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 5964 } 5965 5966 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 5967 SelectionDAG &DAG) const { 5968 // Jump table entries as PC relative offsets. No additional tweaking 5969 // is necessary here. Just get the address of the jump table. 5970 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5971 5972 if (getTargetMachine().getCodeModel() == CodeModel::Large && 5973 !Subtarget->isTargetMachO()) { 5974 return getAddrLarge(JT, DAG); 5975 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 5976 return getAddrTiny(JT, DAG); 5977 } 5978 return getAddr(JT, DAG); 5979 } 5980 5981 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, 5982 SelectionDAG &DAG) const { 5983 // Jump table entries as PC relative offsets. No additional tweaking 5984 // is necessary here. Just get the address of the jump table. 5985 SDLoc DL(Op); 5986 SDValue JT = Op.getOperand(1); 5987 SDValue Entry = Op.getOperand(2); 5988 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); 5989 5990 SDNode *Dest = 5991 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, 5992 Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); 5993 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), 5994 SDValue(Dest, 0)); 5995 } 5996 5997 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 5998 SelectionDAG &DAG) const { 5999 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6000 6001 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 6002 // Use the GOT for the large code model on iOS. 6003 if (Subtarget->isTargetMachO()) { 6004 return getGOT(CP, DAG); 6005 } 6006 return getAddrLarge(CP, DAG); 6007 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 6008 return getAddrTiny(CP, DAG); 6009 } else { 6010 return getAddr(CP, DAG); 6011 } 6012 } 6013 6014 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 6015 SelectionDAG &DAG) const { 6016 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); 6017 if (getTargetMachine().getCodeModel() == CodeModel::Large && 6018 !Subtarget->isTargetMachO()) { 6019 return getAddrLarge(BA, DAG); 6020 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 6021 return getAddrTiny(BA, DAG); 6022 } 6023 return getAddr(BA, DAG); 6024 } 6025 6026 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 6027 SelectionDAG &DAG) const { 6028 AArch64FunctionInfo *FuncInfo = 6029 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 6030 6031 SDLoc DL(Op); 6032 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 6033 getPointerTy(DAG.getDataLayout())); 6034 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); 6035 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6036 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 6037 MachinePointerInfo(SV)); 6038 } 6039 6040 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, 6041 SelectionDAG &DAG) const { 6042 AArch64FunctionInfo *FuncInfo = 6043 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 6044 6045 SDLoc DL(Op); 6046 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 6047 ? 
FuncInfo->getVarArgsGPRIndex() 6048 : FuncInfo->getVarArgsStackIndex(), 6049 getPointerTy(DAG.getDataLayout())); 6050 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6051 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 6052 MachinePointerInfo(SV)); 6053 } 6054 6055 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 6056 SelectionDAG &DAG) const { 6057 // The layout of the va_list struct is specified in the AArch64 Procedure Call 6058 // Standard, section B.3. 6059 MachineFunction &MF = DAG.getMachineFunction(); 6060 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 6061 auto PtrVT = getPointerTy(DAG.getDataLayout()); 6062 SDLoc DL(Op); 6063 6064 SDValue Chain = Op.getOperand(0); 6065 SDValue VAList = Op.getOperand(1); 6066 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6067 SmallVector<SDValue, 4> MemOps; 6068 6069 // void *__stack at offset 0 6070 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 6071 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 6072 MachinePointerInfo(SV), /* Alignment = */ 8)); 6073 6074 // void *__gr_top at offset 8 6075 int GPRSize = FuncInfo->getVarArgsGPRSize(); 6076 if (GPRSize > 0) { 6077 SDValue GRTop, GRTopAddr; 6078 6079 GRTopAddr = 6080 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 6081 6082 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 6083 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 6084 DAG.getConstant(GPRSize, DL, PtrVT)); 6085 6086 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 6087 MachinePointerInfo(SV, 8), 6088 /* Alignment = */ 8)); 6089 } 6090 6091 // void *__vr_top at offset 16 6092 int FPRSize = FuncInfo->getVarArgsFPRSize(); 6093 if (FPRSize > 0) { 6094 SDValue VRTop, VRTopAddr; 6095 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 6096 DAG.getConstant(16, DL, PtrVT)); 6097 6098 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 6099 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 6100 DAG.getConstant(FPRSize, DL, PtrVT)); 6101 6102 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 6103 MachinePointerInfo(SV, 16), 6104 /* Alignment = */ 8)); 6105 } 6106 6107 // int __gr_offs at offset 24 6108 SDValue GROffsAddr = 6109 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 6110 MemOps.push_back(DAG.getStore( 6111 Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, 6112 MachinePointerInfo(SV, 24), /* Alignment = */ 4)); 6113 6114 // int __vr_offs at offset 28 6115 SDValue VROffsAddr = 6116 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 6117 MemOps.push_back(DAG.getStore( 6118 Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, 6119 MachinePointerInfo(SV, 28), /* Alignment = */ 4)); 6120 6121 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 6122 } 6123 6124 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 6125 SelectionDAG &DAG) const { 6126 MachineFunction &MF = DAG.getMachineFunction(); 6127 6128 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) 6129 return LowerWin64_VASTART(Op, DAG); 6130 else if (Subtarget->isTargetDarwin()) 6131 return LowerDarwin_VASTART(Op, DAG); 6132 else 6133 return LowerAAPCS_VASTART(Op, DAG); 6134 } 6135 6136 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 6137 SelectionDAG &DAG) const { 6138 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 6139 // pointer. 
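// For reference, the AAPCS64 va_list referred to here looks roughly like
// (sketch, field names per the ABI document):
//   struct va_list { void *__stack; void *__gr_top; void *__vr_top;
//                    int __gr_offs; int __vr_offs; };  // 3*8 + 2*4 = 32 bytes
// while Darwin and Windows use a single char* cursor.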
6140 SDLoc DL(Op); 6141 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; 6142 unsigned VaListSize = (Subtarget->isTargetDarwin() || 6143 Subtarget->isTargetWindows()) ? PtrSize : 32; 6144 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6145 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6146 6147 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), 6148 DAG.getConstant(VaListSize, DL, MVT::i32), 6149 Align(PtrSize), false, false, false, 6150 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 6151 } 6152 6153 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6154 assert(Subtarget->isTargetDarwin() && 6155 "automatic va_arg instruction only works on Darwin"); 6156 6157 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6158 EVT VT = Op.getValueType(); 6159 SDLoc DL(Op); 6160 SDValue Chain = Op.getOperand(0); 6161 SDValue Addr = Op.getOperand(1); 6162 MaybeAlign Align(Op.getConstantOperandVal(3)); 6163 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; 6164 auto PtrVT = getPointerTy(DAG.getDataLayout()); 6165 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 6166 SDValue VAList = 6167 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); 6168 Chain = VAList.getValue(1); 6169 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); 6170 6171 if (Align && *Align > MinSlotSize) { 6172 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 6173 DAG.getConstant(Align->value() - 1, DL, PtrVT)); 6174 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 6175 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); 6176 } 6177 6178 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 6179 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 6180 6181 // Scalar integer and FP values smaller than 64 bits are implicitly extended 6182 // up to 64 bits. At the very least, we have to increase the striding of the 6183 // vaargs list to match this, and for FP values we need to introduce 6184 // FP_ROUND nodes as well. 6185 if (VT.isInteger() && !VT.isVector()) 6186 ArgSize = std::max(ArgSize, MinSlotSize); 6187 bool NeedFPTrunc = false; 6188 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 6189 ArgSize = 8; 6190 NeedFPTrunc = true; 6191 } 6192 6193 // Increment the pointer, VAList, to the next vaarg 6194 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 6195 DAG.getConstant(ArgSize, DL, PtrVT)); 6196 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); 6197 6198 // Store the incremented VAList to the legalized pointer 6199 SDValue APStore = 6200 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); 6201 6202 // Load the actual argument out of the pointer VAList 6203 if (NeedFPTrunc) { 6204 // Load the value as an f64. 6205 SDValue WideFP = 6206 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); 6207 // Round the value down to an f32. 6208 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 6209 DAG.getIntPtrConstant(1, DL)); 6210 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 6211 // Merge the rounded value with the chain output of the load. 
6212 return DAG.getMergeValues(Ops, DL); 6213 } 6214 6215 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); 6216 } 6217 6218 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 6219 SelectionDAG &DAG) const { 6220 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 6221 MFI.setFrameAddressIsTaken(true); 6222 6223 EVT VT = Op.getValueType(); 6224 SDLoc DL(Op); 6225 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6226 SDValue FrameAddr = 6227 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); 6228 while (Depth--) 6229 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 6230 MachinePointerInfo()); 6231 6232 if (Subtarget->isTargetILP32()) 6233 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, 6234 DAG.getValueType(VT)); 6235 6236 return FrameAddr; 6237 } 6238 6239 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, 6240 SelectionDAG &DAG) const { 6241 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 6242 6243 EVT VT = getPointerTy(DAG.getDataLayout()); 6244 SDLoc DL(Op); 6245 int FI = MFI.CreateFixedObject(4, 0, false); 6246 return DAG.getFrameIndex(FI, VT); 6247 } 6248 6249 #define GET_REGISTER_MATCHER 6250 #include "AArch64GenAsmMatcher.inc" 6251 6252 // FIXME? Maybe this could be a TableGen attribute on some registers and 6253 // this table could be generated automatically from RegInfo. 6254 Register AArch64TargetLowering:: 6255 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { 6256 Register Reg = MatchRegisterName(RegName); 6257 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { 6258 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); 6259 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); 6260 if (!Subtarget->isXRegisterReserved(DwarfRegNum)) 6261 Reg = 0; 6262 } 6263 if (Reg) 6264 return Reg; 6265 report_fatal_error(Twine("Invalid register name \"" 6266 + StringRef(RegName) + "\".")); 6267 } 6268 6269 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, 6270 SelectionDAG &DAG) const { 6271 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); 6272 6273 EVT VT = Op.getValueType(); 6274 SDLoc DL(Op); 6275 6276 SDValue FrameAddr = 6277 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 6278 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 6279 6280 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); 6281 } 6282 6283 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 6284 SelectionDAG &DAG) const { 6285 MachineFunction &MF = DAG.getMachineFunction(); 6286 MachineFrameInfo &MFI = MF.getFrameInfo(); 6287 MFI.setReturnAddressIsTaken(true); 6288 6289 EVT VT = Op.getValueType(); 6290 SDLoc DL(Op); 6291 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6292 if (Depth) { 6293 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6294 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 6295 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 6296 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 6297 MachinePointerInfo()); 6298 } 6299 6300 // Return LR, which contains the return address. Mark it an implicit live-in. 6301 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 6302 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 6303 } 6304 6305 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 6306 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 
6307 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 6308 SelectionDAG &DAG) const { 6309 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6310 EVT VT = Op.getValueType(); 6311 unsigned VTBits = VT.getSizeInBits(); 6312 SDLoc dl(Op); 6313 SDValue ShOpLo = Op.getOperand(0); 6314 SDValue ShOpHi = Op.getOperand(1); 6315 SDValue ShAmt = Op.getOperand(2); 6316 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 6317 6318 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 6319 6320 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 6321 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 6322 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 6323 6324 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which 6325 // is "undef". We wanted 0, so CSEL it directly. 6326 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 6327 ISD::SETEQ, dl, DAG); 6328 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 6329 HiBitsForLo = 6330 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 6331 HiBitsForLo, CCVal, Cmp); 6332 6333 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 6334 DAG.getConstant(VTBits, dl, MVT::i64)); 6335 6336 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 6337 SDValue LoForNormalShift = 6338 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); 6339 6340 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 6341 dl, DAG); 6342 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 6343 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 6344 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 6345 LoForNormalShift, CCVal, Cmp); 6346 6347 // AArch64 shifts larger than the register width are wrapped rather than 6348 // clamped, so we can't just emit "hi >> x". 6349 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 6350 SDValue HiForBigShift = 6351 Opc == ISD::SRA 6352 ? DAG.getNode(Opc, dl, VT, ShOpHi, 6353 DAG.getConstant(VTBits - 1, dl, MVT::i64)) 6354 : DAG.getConstant(0, dl, VT); 6355 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 6356 HiForNormalShift, CCVal, Cmp); 6357 6358 SDValue Ops[2] = { Lo, Hi }; 6359 return DAG.getMergeValues(Ops, dl); 6360 } 6361 6362 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 6363 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 6364 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 6365 SelectionDAG &DAG) const { 6366 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6367 EVT VT = Op.getValueType(); 6368 unsigned VTBits = VT.getSizeInBits(); 6369 SDLoc dl(Op); 6370 SDValue ShOpLo = Op.getOperand(0); 6371 SDValue ShOpHi = Op.getOperand(1); 6372 SDValue ShAmt = Op.getOperand(2); 6373 6374 assert(Op.getOpcode() == ISD::SHL_PARTS); 6375 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 6376 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 6377 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 6378 6379 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which 6380 // is "undef". We wanted 0, so CSEL it directly. 
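// (ISD::SRL by the full bit width is undef in the DAG, and the AArch64
// variable-shift instructions take the amount modulo 64, so a shift by 64
// would hand back the operand rather than 0; hence the explicit CSEL.)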
6381 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
6382 ISD::SETEQ, dl, DAG);
6383 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
6384 LoBitsForHi =
6385 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
6386 LoBitsForHi, CCVal, Cmp);
6387
6388 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
6389 DAG.getConstant(VTBits, dl, MVT::i64));
6390 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6391 SDValue HiForNormalShift =
6392 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
6393
6394 SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6395
6396 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
6397 dl, DAG);
6398 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
6399 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
6400 HiForNormalShift, CCVal, Cmp);
6401
6402 // AArch64 shifts of larger than register sizes are wrapped rather than
6403 // clamped, so we can't just emit "lo << a" if a is too big.
6404 SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
6405 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6406 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
6407 LoForNormalShift, CCVal, Cmp);
6408
6409 SDValue Ops[2] = { Lo, Hi };
6410 return DAG.getMergeValues(Ops, dl);
6411 }
6412
6413 bool AArch64TargetLowering::isOffsetFoldingLegal(
6414 const GlobalAddressSDNode *GA) const {
6415 // Offsets are folded in the DAG combine rather than here so that we can
6416 // intelligently choose an offset based on the uses.
6417 return false;
6418 }
6419
6420 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
6421 bool OptForSize) const {
6422 bool IsLegal = false;
6423 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
6424 // 16-bit case when target has full fp16 support.
6425 // FIXME: We should be able to handle f128 as well with a clever lowering.
6426 const APInt ImmInt = Imm.bitcastToAPInt();
6427 if (VT == MVT::f64)
6428 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
6429 else if (VT == MVT::f32)
6430 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
6431 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
6432 IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
6433 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
6434 // generate that fmov.
6435
6436 // If we cannot materialize the value in an fmov immediate field, check if it
6437 // can be encoded as the immediate operand of a logical instruction.
6438 // The immediate value will be created with either MOVZ, MOVN, or ORR.
6439 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
6440 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
6441 // however the mov+fmov sequence is always better because of the reduced
6442 // cache pressure. The timings are still the same if you consider
6443 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
6444 // movw+movk is fused). So we limit this to at most 2 instructions.
6445 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
6446 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
6447 Insn);
6448 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
6449 IsLegal = Insn.size() <= Limit;
6450 }
6451
6452 LLVM_DEBUG(dbgs() << (IsLegal ?
"Legal " : "Illegal ") << VT.getEVTString() 6453 << " imm value: "; Imm.dump();); 6454 return IsLegal; 6455 } 6456 6457 //===----------------------------------------------------------------------===// 6458 // AArch64 Optimization Hooks 6459 //===----------------------------------------------------------------------===// 6460 6461 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, 6462 SDValue Operand, SelectionDAG &DAG, 6463 int &ExtraSteps) { 6464 EVT VT = Operand.getValueType(); 6465 if (ST->hasNEON() && 6466 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || 6467 VT == MVT::f32 || VT == MVT::v1f32 || 6468 VT == MVT::v2f32 || VT == MVT::v4f32)) { 6469 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) 6470 // For the reciprocal estimates, convergence is quadratic, so the number 6471 // of digits is doubled after each iteration. In ARMv8, the accuracy of 6472 // the initial estimate is 2^-8. Thus the number of extra steps to refine 6473 // the result for float (23 mantissa bits) is 2 and for double (52 6474 // mantissa bits) is 3. 6475 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; 6476 6477 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 6478 } 6479 6480 return SDValue(); 6481 } 6482 6483 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, 6484 SelectionDAG &DAG, int Enabled, 6485 int &ExtraSteps, 6486 bool &UseOneConst, 6487 bool Reciprocal) const { 6488 if (Enabled == ReciprocalEstimate::Enabled || 6489 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) 6490 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, 6491 DAG, ExtraSteps)) { 6492 SDLoc DL(Operand); 6493 EVT VT = Operand.getValueType(); 6494 6495 SDNodeFlags Flags; 6496 Flags.setAllowReassociation(true); 6497 6498 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) 6499 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) 6500 for (int i = ExtraSteps; i > 0; --i) { 6501 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, 6502 Flags); 6503 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); 6504 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 6505 } 6506 if (!Reciprocal) { 6507 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), 6508 VT); 6509 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 6510 SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); 6511 6512 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); 6513 // Correct the result if the operand is 0.0. 6514 Estimate = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, 6515 VT, Eq, Operand, Estimate); 6516 } 6517 6518 ExtraSteps = 0; 6519 return Estimate; 6520 } 6521 6522 return SDValue(); 6523 } 6524 6525 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 6526 SelectionDAG &DAG, int Enabled, 6527 int &ExtraSteps) const { 6528 if (Enabled == ReciprocalEstimate::Enabled) 6529 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, 6530 DAG, ExtraSteps)) { 6531 SDLoc DL(Operand); 6532 EVT VT = Operand.getValueType(); 6533 6534 SDNodeFlags Flags; 6535 Flags.setAllowReassociation(true); 6536 6537 // Newton reciprocal iteration: E * (2 - X * E) 6538 // AArch64 reciprocal iteration instruction: (2 - M * N) 6539 for (int i = ExtraSteps; i > 0; --i) { 6540 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, 6541 Estimate, Flags); 6542 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 6543 } 6544 6545 ExtraSteps = 0; 6546 return Estimate; 6547 } 6548 6549 return SDValue(); 6550 } 6551 6552 //===----------------------------------------------------------------------===// 6553 // AArch64 Inline Assembly Support 6554 //===----------------------------------------------------------------------===// 6555 6556 // Table of Constraints 6557 // TODO: This is the current set of constraints supported by ARM for the 6558 // compiler, not all of them may make sense. 6559 // 6560 // r - A general register 6561 // w - An FP/SIMD register of some size in the range v0-v31 6562 // x - An FP/SIMD register of some size in the range v0-v15 6563 // I - Constant that can be used with an ADD instruction 6564 // J - Constant that can be used with a SUB instruction 6565 // K - Constant that can be used with a 32-bit logical instruction 6566 // L - Constant that can be used with a 64-bit logical instruction 6567 // M - Constant that can be used as a 32-bit MOV immediate 6568 // N - Constant that can be used as a 64-bit MOV immediate 6569 // Q - A memory reference with base register and no offset 6570 // S - A symbolic address 6571 // Y - Floating point constant zero 6572 // Z - Integer constant zero 6573 // 6574 // Note that general register operands will be output using their 64-bit x 6575 // register name, whatever the size of the variable, unless the asm operand 6576 // is prefixed by the %w modifier. Floating-point and SIMD register operands 6577 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 6578 // %q modifier. 6579 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 6580 // At this point, we have to lower this constraint to something else, so we 6581 // lower it to an "r" or "w". However, by doing this we will force the result 6582 // to be in register, while the X constraint is much more permissive. 6583 // 6584 // Although we are correct (we are free to emit anything, without 6585 // constraints), we might break use cases that would expect us to be more 6586 // efficient and emit something else. 
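// Hypothetical example (for exposition only): an "X"-constrained float
// operand ends up lowered to "w" here, pinning it to an FP/SIMD register even
// though 'X' nominally accepts any operand at all.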
6587 if (!Subtarget->hasFPARMv8()) 6588 return "r"; 6589 6590 if (ConstraintVT.isFloatingPoint()) 6591 return "w"; 6592 6593 if (ConstraintVT.isVector() && 6594 (ConstraintVT.getSizeInBits() == 64 || 6595 ConstraintVT.getSizeInBits() == 128)) 6596 return "w"; 6597 6598 return "r"; 6599 } 6600 6601 enum PredicateConstraint { 6602 Upl, 6603 Upa, 6604 Invalid 6605 }; 6606 6607 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { 6608 PredicateConstraint P = PredicateConstraint::Invalid; 6609 if (Constraint == "Upa") 6610 P = PredicateConstraint::Upa; 6611 if (Constraint == "Upl") 6612 P = PredicateConstraint::Upl; 6613 return P; 6614 } 6615 6616 /// getConstraintType - Given a constraint letter, return the type of 6617 /// constraint it is for this target. 6618 AArch64TargetLowering::ConstraintType 6619 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 6620 if (Constraint.size() == 1) { 6621 switch (Constraint[0]) { 6622 default: 6623 break; 6624 case 'x': 6625 case 'w': 6626 case 'y': 6627 return C_RegisterClass; 6628 // An address with a single base register. Due to the way we 6629 // currently handle addresses it is the same as 'r'. 6630 case 'Q': 6631 return C_Memory; 6632 case 'I': 6633 case 'J': 6634 case 'K': 6635 case 'L': 6636 case 'M': 6637 case 'N': 6638 case 'Y': 6639 case 'Z': 6640 return C_Immediate; 6641 case 'z': 6642 case 'S': // A symbolic address 6643 return C_Other; 6644 } 6645 } else if (parsePredicateConstraint(Constraint) != 6646 PredicateConstraint::Invalid) 6647 return C_RegisterClass; 6648 return TargetLowering::getConstraintType(Constraint); 6649 } 6650 6651 /// Examine constraint type and operand type and determine a weight value. 6652 /// This object must already have been set up with the operand type 6653 /// and the current alternative constraint selected. 6654 TargetLowering::ConstraintWeight 6655 AArch64TargetLowering::getSingleConstraintMatchWeight( 6656 AsmOperandInfo &info, const char *constraint) const { 6657 ConstraintWeight weight = CW_Invalid; 6658 Value *CallOperandVal = info.CallOperandVal; 6659 // If we don't have a value, we can't do a match, 6660 // but allow it at the lowest weight. 6661 if (!CallOperandVal) 6662 return CW_Default; 6663 Type *type = CallOperandVal->getType(); 6664 // Look at the constraint type. 
6665 switch (*constraint) { 6666 default: 6667 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 6668 break; 6669 case 'x': 6670 case 'w': 6671 case 'y': 6672 if (type->isFloatingPointTy() || type->isVectorTy()) 6673 weight = CW_Register; 6674 break; 6675 case 'z': 6676 weight = CW_Constant; 6677 break; 6678 case 'U': 6679 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) 6680 weight = CW_Register; 6681 break; 6682 } 6683 return weight; 6684 } 6685 6686 std::pair<unsigned, const TargetRegisterClass *> 6687 AArch64TargetLowering::getRegForInlineAsmConstraint( 6688 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 6689 if (Constraint.size() == 1) { 6690 switch (Constraint[0]) { 6691 case 'r': 6692 if (VT.getSizeInBits() == 64) 6693 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 6694 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 6695 case 'w': 6696 if (!Subtarget->hasFPARMv8()) 6697 break; 6698 if (VT.isScalableVector()) 6699 return std::make_pair(0U, &AArch64::ZPRRegClass); 6700 if (VT.getSizeInBits() == 16) 6701 return std::make_pair(0U, &AArch64::FPR16RegClass); 6702 if (VT.getSizeInBits() == 32) 6703 return std::make_pair(0U, &AArch64::FPR32RegClass); 6704 if (VT.getSizeInBits() == 64) 6705 return std::make_pair(0U, &AArch64::FPR64RegClass); 6706 if (VT.getSizeInBits() == 128) 6707 return std::make_pair(0U, &AArch64::FPR128RegClass); 6708 break; 6709 // The instructions that this constraint is designed for can 6710 // only take 128-bit registers so just use that regclass. 6711 case 'x': 6712 if (!Subtarget->hasFPARMv8()) 6713 break; 6714 if (VT.isScalableVector()) 6715 return std::make_pair(0U, &AArch64::ZPR_4bRegClass); 6716 if (VT.getSizeInBits() == 128) 6717 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 6718 break; 6719 case 'y': 6720 if (!Subtarget->hasFPARMv8()) 6721 break; 6722 if (VT.isScalableVector()) 6723 return std::make_pair(0U, &AArch64::ZPR_3bRegClass); 6724 break; 6725 } 6726 } else { 6727 PredicateConstraint PC = parsePredicateConstraint(Constraint); 6728 if (PC != PredicateConstraint::Invalid) { 6729 assert(VT.isScalableVector()); 6730 bool restricted = (PC == PredicateConstraint::Upl); 6731 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) 6732 : std::make_pair(0U, &AArch64::PPRRegClass); 6733 } 6734 } 6735 if (StringRef("{cc}").equals_lower(Constraint)) 6736 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 6737 6738 // Use the default implementation in TargetLowering to convert the register 6739 // constraint into a member of a register class. 6740 std::pair<unsigned, const TargetRegisterClass *> Res; 6741 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 6742 6743 // Not found as a standard register? 6744 if (!Res.second) { 6745 unsigned Size = Constraint.size(); 6746 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 6747 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 6748 int RegNo; 6749 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 6750 if (!Failed && RegNo >= 0 && RegNo <= 31) { 6751 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 6752 // By default we'll emit v0-v31 for this unless there's a modifier where 6753 // we'll emit the correct register as well. 
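// Illustrative: "{v7}" with a 64-bit operand resolves to d7 (FPR64), while a
// 128-bit or unknown-typed operand resolves to q7 (FPR128), per the branches
// below.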
6754 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 6755 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 6756 Res.second = &AArch64::FPR64RegClass; 6757 } else { 6758 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 6759 Res.second = &AArch64::FPR128RegClass; 6760 } 6761 } 6762 } 6763 } 6764 6765 if (Res.second && !Subtarget->hasFPARMv8() && 6766 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && 6767 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) 6768 return std::make_pair(0U, nullptr); 6769 6770 return Res; 6771 } 6772 6773 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 6774 /// vector. If it is invalid, don't add anything to Ops. 6775 void AArch64TargetLowering::LowerAsmOperandForConstraint( 6776 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 6777 SelectionDAG &DAG) const { 6778 SDValue Result; 6779 6780 // Currently only support length 1 constraints. 6781 if (Constraint.length() != 1) 6782 return; 6783 6784 char ConstraintLetter = Constraint[0]; 6785 switch (ConstraintLetter) { 6786 default: 6787 break; 6788 6789 // This set of constraints deal with valid constants for various instructions. 6790 // Validate and return a target constant for them if we can. 6791 case 'z': { 6792 // 'z' maps to xzr or wzr so it needs an input of 0. 6793 if (!isNullConstant(Op)) 6794 return; 6795 6796 if (Op.getValueType() == MVT::i64) 6797 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 6798 else 6799 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 6800 break; 6801 } 6802 case 'S': { 6803 // An absolute symbolic address or label reference. 6804 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 6805 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 6806 GA->getValueType(0)); 6807 } else if (const BlockAddressSDNode *BA = 6808 dyn_cast<BlockAddressSDNode>(Op)) { 6809 Result = 6810 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); 6811 } else if (const ExternalSymbolSDNode *ES = 6812 dyn_cast<ExternalSymbolSDNode>(Op)) { 6813 Result = 6814 DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0)); 6815 } else 6816 return; 6817 break; 6818 } 6819 6820 case 'I': 6821 case 'J': 6822 case 'K': 6823 case 'L': 6824 case 'M': 6825 case 'N': 6826 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 6827 if (!C) 6828 return; 6829 6830 // Grab the value and do some validation. 6831 uint64_t CVal = C->getZExtValue(); 6832 switch (ConstraintLetter) { 6833 // The I constraint applies only to simple ADD or SUB immediate operands: 6834 // i.e. 0 to 4095 with optional shift by 12 6835 // The J constraint applies only to ADD or SUB immediates that would be 6836 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 6837 // instruction [or vice versa], in other words -1 to -4095 with optional 6838 // left shift by 12. 6839 case 'I': 6840 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 6841 break; 6842 return; 6843 case 'J': { 6844 uint64_t NVal = -C->getSExtValue(); 6845 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 6846 CVal = C->getSExtValue(); 6847 break; 6848 } 6849 return; 6850 } 6851 // The K and L constraints apply *only* to logical immediates, including 6852 // what used to be the MOVI alias for ORR (though the MOVI alias has now 6853 // been removed and MOV should be used). 
So these constraints have to 6854 // distinguish between bit patterns that are valid 32-bit or 64-bit 6855 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 6856 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 6857 // versa. 6858 case 'K': 6859 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 6860 break; 6861 return; 6862 case 'L': 6863 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 6864 break; 6865 return; 6866 // The M and N constraints are a superset of K and L respectively, for use 6867 // with the MOV (immediate) alias. As well as the logical immediates they 6868 // also match 32 or 64-bit immediates that can be loaded either using a 6869 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 6870 // (M) or 64-bit 0x1234000000000000 (N) etc. 6871 // As a note some of this code is liberally stolen from the asm parser. 6872 case 'M': { 6873 if (!isUInt<32>(CVal)) 6874 return; 6875 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 6876 break; 6877 if ((CVal & 0xFFFF) == CVal) 6878 break; 6879 if ((CVal & 0xFFFF0000ULL) == CVal) 6880 break; 6881 uint64_t NCVal = ~(uint32_t)CVal; 6882 if ((NCVal & 0xFFFFULL) == NCVal) 6883 break; 6884 if ((NCVal & 0xFFFF0000ULL) == NCVal) 6885 break; 6886 return; 6887 } 6888 case 'N': { 6889 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 6890 break; 6891 if ((CVal & 0xFFFFULL) == CVal) 6892 break; 6893 if ((CVal & 0xFFFF0000ULL) == CVal) 6894 break; 6895 if ((CVal & 0xFFFF00000000ULL) == CVal) 6896 break; 6897 if ((CVal & 0xFFFF000000000000ULL) == CVal) 6898 break; 6899 uint64_t NCVal = ~CVal; 6900 if ((NCVal & 0xFFFFULL) == NCVal) 6901 break; 6902 if ((NCVal & 0xFFFF0000ULL) == NCVal) 6903 break; 6904 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 6905 break; 6906 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 6907 break; 6908 return; 6909 } 6910 default: 6911 return; 6912 } 6913 6914 // All assembler immediates are 64-bit integers. 6915 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 6916 break; 6917 } 6918 6919 if (Result.getNode()) { 6920 Ops.push_back(Result); 6921 return; 6922 } 6923 6924 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 6925 } 6926 6927 //===----------------------------------------------------------------------===// 6928 // AArch64 Advanced SIMD Support 6929 //===----------------------------------------------------------------------===// 6930 6931 /// WidenVector - Given a value in the V64 register class, produce the 6932 /// equivalent value in the V128 register class. 6933 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 6934 EVT VT = V64Reg.getValueType(); 6935 unsigned NarrowSize = VT.getVectorNumElements(); 6936 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 6937 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 6938 SDLoc DL(V64Reg); 6939 6940 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 6941 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 6942 } 6943 6944 /// getExtFactor - Determine the adjustment factor for the position when 6945 /// generating an "extract from vector registers" instruction. 6946 static unsigned getExtFactor(SDValue &V) { 6947 EVT EltType = V.getValueType().getVectorElementType(); 6948 return EltType.getSizeInBits() / 8; 6949 } 6950 6951 /// NarrowVector - Given a value in the V128 register class, produce the 6952 /// equivalent value in the V64 register class. 
6953 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 6954 EVT VT = V128Reg.getValueType(); 6955 unsigned WideSize = VT.getVectorNumElements(); 6956 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 6957 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 6958 SDLoc DL(V128Reg); 6959 6960 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 6961 } 6962 6963 // Gather data to see if the operation can be modelled as a 6964 // shuffle in combination with VEXTs. 6965 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 6966 SelectionDAG &DAG) const { 6967 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6968 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); 6969 SDLoc dl(Op); 6970 EVT VT = Op.getValueType(); 6971 unsigned NumElts = VT.getVectorNumElements(); 6972 6973 struct ShuffleSourceInfo { 6974 SDValue Vec; 6975 unsigned MinElt; 6976 unsigned MaxElt; 6977 6978 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 6979 // be compatible with the shuffle we intend to construct. As a result 6980 // ShuffleVec will be some sliding window into the original Vec. 6981 SDValue ShuffleVec; 6982 6983 // Code should guarantee that element i in Vec starts at element "WindowBase 6984 // + i * WindowScale in ShuffleVec". 6985 int WindowBase; 6986 int WindowScale; 6987 6988 ShuffleSourceInfo(SDValue Vec) 6989 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), 6990 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} 6991 6992 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6993 }; 6994 6995 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6996 // node. 6997 SmallVector<ShuffleSourceInfo, 2> Sources; 6998 for (unsigned i = 0; i < NumElts; ++i) { 6999 SDValue V = Op.getOperand(i); 7000 if (V.isUndef()) 7001 continue; 7002 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7003 !isa<ConstantSDNode>(V.getOperand(1))) { 7004 LLVM_DEBUG( 7005 dbgs() << "Reshuffle failed: " 7006 "a shuffle can only come from building a vector from " 7007 "various elements of other vectors, provided their " 7008 "indices are constant\n"); 7009 return SDValue(); 7010 } 7011 7012 // Add this element source to the list if it's not already there. 7013 SDValue SourceVec = V.getOperand(0); 7014 auto Source = find(Sources, SourceVec); 7015 if (Source == Sources.end()) 7016 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7017 7018 // Update the minimum and maximum lane number seen. 7019 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7020 Source->MinElt = std::min(Source->MinElt, EltNo); 7021 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7022 } 7023 7024 if (Sources.size() > 2) { 7025 LLVM_DEBUG( 7026 dbgs() << "Reshuffle failed: currently only do something sane when at " 7027 "most two source vectors are involved\n"); 7028 return SDValue(); 7029 } 7030 7031 // Find out the smallest element size among result and two sources, and use 7032 // it as element size to build the shuffle_vector. 
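// Illustrative (made-up types): a v8i16 BUILD_VECTOR drawing elements from a
// v16i8 source picks i8 as the common element type, so the shuffle is built
// as v16i8 and bitcast back to v8i16 at the end.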
7033 EVT SmallestEltTy = VT.getVectorElementType(); 7034 for (auto &Source : Sources) { 7035 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7036 if (SrcEltTy.bitsLT(SmallestEltTy)) { 7037 SmallestEltTy = SrcEltTy; 7038 } 7039 } 7040 unsigned ResMultiplier = 7041 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7042 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7043 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7044 7045 // If the source vector is too wide or too narrow, we may nevertheless be able 7046 // to construct a compatible shuffle either by concatenating it with UNDEF or 7047 // extracting a suitable range of elements. 7048 for (auto &Src : Sources) { 7049 EVT SrcVT = Src.ShuffleVec.getValueType(); 7050 7051 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 7052 continue; 7053 7054 // This stage of the search produces a source with the same element type as 7055 // the original, but with a total width matching the BUILD_VECTOR output. 7056 EVT EltVT = SrcVT.getVectorElementType(); 7057 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 7058 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7059 7060 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 7061 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 7062 // We can pad out the smaller vector for free, so if it's part of a 7063 // shuffle... 7064 Src.ShuffleVec = 7065 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 7066 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 7067 continue; 7068 } 7069 7070 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 7071 7072 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 7073 LLVM_DEBUG( 7074 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); 7075 return SDValue(); 7076 } 7077 7078 if (Src.MinElt >= NumSrcElts) { 7079 // The extraction can just take the second half 7080 Src.ShuffleVec = 7081 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7082 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 7083 Src.WindowBase = -NumSrcElts; 7084 } else if (Src.MaxElt < NumSrcElts) { 7085 // The extraction can just take the first half 7086 Src.ShuffleVec = 7087 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7088 DAG.getConstant(0, dl, MVT::i64)); 7089 } else { 7090 // An actual VEXT is needed 7091 SDValue VEXTSrc1 = 7092 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7093 DAG.getConstant(0, dl, MVT::i64)); 7094 SDValue VEXTSrc2 = 7095 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7096 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 7097 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 7098 7099 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 7100 VEXTSrc2, 7101 DAG.getConstant(Imm, dl, MVT::i32)); 7102 Src.WindowBase = -Src.MinElt; 7103 } 7104 } 7105 7106 // Another possible incompatibility occurs from the vector element types. We 7107 // can fix this by bitcasting the source vectors to the same type we intend 7108 // for the shuffle. 
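// Illustrative: a v4i32 source bitcast to a v16i8 shuffle type gets
// WindowScale = 4, so its original element i occupies shuffle lanes
// WindowBase + 4*i .. WindowBase + 4*i + 3.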
7109 for (auto &Src : Sources) { 7110 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 7111 if (SrcEltTy == SmallestEltTy) 7112 continue; 7113 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 7114 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 7115 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7116 Src.WindowBase *= Src.WindowScale; 7117 } 7118 7119 // Final sanity check before we try to actually produce a shuffle. 7120 LLVM_DEBUG(for (auto Src 7121 : Sources) 7122 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 7123 7124 // The stars all align, our next step is to produce the mask for the shuffle. 7125 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 7126 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 7127 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 7128 SDValue Entry = Op.getOperand(i); 7129 if (Entry.isUndef()) 7130 continue; 7131 7132 auto Src = find(Sources, Entry.getOperand(0)); 7133 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 7134 7135 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 7136 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 7137 // segment. 7138 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 7139 int BitsDefined = 7140 std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); 7141 int LanesDefined = BitsDefined / BitsPerShuffleLane; 7142 7143 // This source is expected to fill ResMultiplier lanes of the final shuffle, 7144 // starting at the appropriate offset. 7145 int *LaneMask = &Mask[i * ResMultiplier]; 7146 7147 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 7148 ExtractBase += NumElts * (Src - Sources.begin()); 7149 for (int j = 0; j < LanesDefined; ++j) 7150 LaneMask[j] = ExtractBase + j; 7151 } 7152 7153 // Final check before we try to produce nonsense... 7154 if (!isShuffleMaskLegal(Mask, ShuffleVT)) { 7155 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); 7156 return SDValue(); 7157 } 7158 7159 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 7160 for (unsigned i = 0; i < Sources.size(); ++i) 7161 ShuffleOps[i] = Sources[i].ShuffleVec; 7162 7163 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 7164 ShuffleOps[1], Mask); 7165 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 7166 7167 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); 7168 dbgs() << "Reshuffle, creating node: "; V.dump();); 7169 7170 return V; 7171 } 7172 7173 // check if an EXT instruction can handle the shuffle mask when the 7174 // vector sources of the shuffle are the same. 7175 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 7176 unsigned NumElts = VT.getVectorNumElements(); 7177 7178 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7179 if (M[0] < 0) 7180 return false; 7181 7182 Imm = M[0]; 7183 7184 // If this is a VEXT shuffle, the immediate value is the index of the first 7185 // element. The other shuffle indices must be the successive elements after 7186 // the first one. 7187 unsigned ExpectedElt = Imm; 7188 for (unsigned i = 1; i < NumElts; ++i) { 7189 // Increment the expected index. If it wraps around, just follow it 7190 // back to index zero and keep going. 
7191 ++ExpectedElt;
7192 if (ExpectedElt == NumElts)
7193 ExpectedElt = 0;
7194
7195 if (M[i] < 0)
7196 continue; // ignore UNDEF indices
7197 if (ExpectedElt != static_cast<unsigned>(M[i]))
7198 return false;
7199 }
7200
7201 return true;
7202 }
7203
7204 // check if an EXT instruction can handle the shuffle mask when the
7205 // vector sources of the shuffle are different.
7206 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
7207 unsigned &Imm) {
7208 // Look for the first non-undef element.
7209 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
7210
7211 // Benefit from APInt to handle overflow when calculating the expected element.
7212 unsigned NumElts = VT.getVectorNumElements();
7213 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
7214 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
7215 // The following shuffle indices must be the successive elements after the
7216 // first real element.
7217 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
7218 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
7219 if (FirstWrongElt != M.end())
7220 return false;
7221
7222 // The index of an EXT is the first element if it is not UNDEF.
7223 // Watch out for the beginning UNDEFs. The EXT index should be the expected
7224 // value of the first element. E.g.
7225 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
7226 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
7227 // ExpectedElt is the last mask index plus 1.
7228 Imm = ExpectedElt.getZExtValue();
7229
7230 // There are two different cases that require reversing the input vectors.
7231 // For example, for vector <4 x i32> we have the following cases,
7232 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
7233 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
7234 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
7235 // reversing the two input vectors.
7236 if (Imm < NumElts)
7237 ReverseEXT = true;
7238 else
7239 Imm -= NumElts;
7240
7241 return true;
7242 }
7243
7244 /// isREVMask - Check if a vector shuffle corresponds to a REV
7245 /// instruction with the specified blocksize. (The order of the elements
7246 /// within each block of the vector is reversed.)
7247 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
7248 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
7249 "Only possible block sizes for REV are: 16, 32, 64");
7250
7251 unsigned EltSz = VT.getScalarSizeInBits();
7252 if (EltSz == 64)
7253 return false;
7254
7255 unsigned NumElts = VT.getVectorNumElements();
7256 unsigned BlockElts = M[0] + 1;
7257 // If the first shuffle index is UNDEF, be optimistic.
7258 if (M[0] < 0)
7259 BlockElts = BlockSize / EltSz;
7260
7261 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
7262 return false;
7263
7264 for (unsigned i = 0; i < NumElts; ++i) {
7265 if (M[i] < 0)
7266 continue; // ignore UNDEF indices
7267 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
7268 return false;
7269 }
7270
7271 return true;
7272 }
7273
7274 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7275 unsigned NumElts = VT.getVectorNumElements();
7276 if (NumElts % 2 != 0)
7277 return false;
7278 WhichResult = (M[0] == 0 ?
0 : 1); 7279 unsigned Idx = WhichResult * NumElts / 2; 7280 for (unsigned i = 0; i != NumElts; i += 2) { 7281 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 7282 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 7283 return false; 7284 Idx += 1; 7285 } 7286 7287 return true; 7288 } 7289 7290 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7291 unsigned NumElts = VT.getVectorNumElements(); 7292 WhichResult = (M[0] == 0 ? 0 : 1); 7293 for (unsigned i = 0; i != NumElts; ++i) { 7294 if (M[i] < 0) 7295 continue; // ignore UNDEF indices 7296 if ((unsigned)M[i] != 2 * i + WhichResult) 7297 return false; 7298 } 7299 7300 return true; 7301 } 7302 7303 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7304 unsigned NumElts = VT.getVectorNumElements(); 7305 if (NumElts % 2 != 0) 7306 return false; 7307 WhichResult = (M[0] == 0 ? 0 : 1); 7308 for (unsigned i = 0; i < NumElts; i += 2) { 7309 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 7310 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 7311 return false; 7312 } 7313 return true; 7314 } 7315 7316 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 7317 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7318 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 7319 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7320 unsigned NumElts = VT.getVectorNumElements(); 7321 if (NumElts % 2 != 0) 7322 return false; 7323 WhichResult = (M[0] == 0 ? 0 : 1); 7324 unsigned Idx = WhichResult * NumElts / 2; 7325 for (unsigned i = 0; i != NumElts; i += 2) { 7326 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 7327 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 7328 return false; 7329 Idx += 1; 7330 } 7331 7332 return true; 7333 } 7334 7335 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 7336 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7337 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 7338 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7339 unsigned Half = VT.getVectorNumElements() / 2; 7340 WhichResult = (M[0] == 0 ? 0 : 1); 7341 for (unsigned j = 0; j != 2; ++j) { 7342 unsigned Idx = WhichResult; 7343 for (unsigned i = 0; i != Half; ++i) { 7344 int MIdx = M[i + j * Half]; 7345 if (MIdx >= 0 && (unsigned)MIdx != Idx) 7346 return false; 7347 Idx += 2; 7348 } 7349 } 7350 7351 return true; 7352 } 7353 7354 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 7355 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7356 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 7357 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7358 unsigned NumElts = VT.getVectorNumElements(); 7359 if (NumElts % 2 != 0) 7360 return false; 7361 WhichResult = (M[0] == 0 ? 
0 : 1); 7362 for (unsigned i = 0; i < NumElts; i += 2) { 7363 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 7364 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 7365 return false; 7366 } 7367 return true; 7368 } 7369 7370 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 7371 bool &DstIsLeft, int &Anomaly) { 7372 if (M.size() != static_cast<size_t>(NumInputElements)) 7373 return false; 7374 7375 int NumLHSMatch = 0, NumRHSMatch = 0; 7376 int LastLHSMismatch = -1, LastRHSMismatch = -1; 7377 7378 for (int i = 0; i < NumInputElements; ++i) { 7379 if (M[i] == -1) { 7380 ++NumLHSMatch; 7381 ++NumRHSMatch; 7382 continue; 7383 } 7384 7385 if (M[i] == i) 7386 ++NumLHSMatch; 7387 else 7388 LastLHSMismatch = i; 7389 7390 if (M[i] == i + NumInputElements) 7391 ++NumRHSMatch; 7392 else 7393 LastRHSMismatch = i; 7394 } 7395 7396 if (NumLHSMatch == NumInputElements - 1) { 7397 DstIsLeft = true; 7398 Anomaly = LastLHSMismatch; 7399 return true; 7400 } else if (NumRHSMatch == NumInputElements - 1) { 7401 DstIsLeft = false; 7402 Anomaly = LastRHSMismatch; 7403 return true; 7404 } 7405 7406 return false; 7407 } 7408 7409 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 7410 if (VT.getSizeInBits() != 128) 7411 return false; 7412 7413 unsigned NumElts = VT.getVectorNumElements(); 7414 7415 for (int I = 0, E = NumElts / 2; I != E; I++) { 7416 if (Mask[I] != I) 7417 return false; 7418 } 7419 7420 int Offset = NumElts / 2; 7421 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 7422 if (Mask[I] != I + SplitLHS * Offset) 7423 return false; 7424 } 7425 7426 return true; 7427 } 7428 7429 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 7430 SDLoc DL(Op); 7431 EVT VT = Op.getValueType(); 7432 SDValue V0 = Op.getOperand(0); 7433 SDValue V1 = Op.getOperand(1); 7434 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 7435 7436 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 7437 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 7438 return SDValue(); 7439 7440 bool SplitV0 = V0.getValueSizeInBits() == 128; 7441 7442 if (!isConcatMask(Mask, VT, SplitV0)) 7443 return SDValue(); 7444 7445 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 7446 if (SplitV0) { 7447 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 7448 DAG.getConstant(0, DL, MVT::i64)); 7449 } 7450 if (V1.getValueSizeInBits() == 128) { 7451 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 7452 DAG.getConstant(0, DL, MVT::i64)); 7453 } 7454 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 7455 } 7456 7457 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7458 /// the specified operations to build the shuffle. 
7459 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7460 SDValue RHS, SelectionDAG &DAG, 7461 const SDLoc &dl) { 7462 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7463 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 7464 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 7465 7466 enum { 7467 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7468 OP_VREV, 7469 OP_VDUP0, 7470 OP_VDUP1, 7471 OP_VDUP2, 7472 OP_VDUP3, 7473 OP_VEXT1, 7474 OP_VEXT2, 7475 OP_VEXT3, 7476 OP_VUZPL, // VUZP, left result 7477 OP_VUZPR, // VUZP, right result 7478 OP_VZIPL, // VZIP, left result 7479 OP_VZIPR, // VZIP, right result 7480 OP_VTRNL, // VTRN, left result 7481 OP_VTRNR // VTRN, right result 7482 }; 7483 7484 if (OpNum == OP_COPY) { 7485 if (LHSID == (1 * 9 + 2) * 9 + 3) 7486 return LHS; 7487 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 7488 return RHS; 7489 } 7490 7491 SDValue OpLHS, OpRHS; 7492 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7493 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7494 EVT VT = OpLHS.getValueType(); 7495 7496 switch (OpNum) { 7497 default: 7498 llvm_unreachable("Unknown shuffle opcode!"); 7499 case OP_VREV: 7500 // VREV divides the vector in half and swaps within the half. 7501 if (VT.getVectorElementType() == MVT::i32 || 7502 VT.getVectorElementType() == MVT::f32) 7503 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 7504 // vrev <4 x i16> -> REV32 7505 if (VT.getVectorElementType() == MVT::i16 || 7506 VT.getVectorElementType() == MVT::f16 || 7507 VT.getVectorElementType() == MVT::bf16) 7508 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 7509 // vrev <4 x i8> -> REV16 7510 assert(VT.getVectorElementType() == MVT::i8); 7511 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 7512 case OP_VDUP0: 7513 case OP_VDUP1: 7514 case OP_VDUP2: 7515 case OP_VDUP3: { 7516 EVT EltTy = VT.getVectorElementType(); 7517 unsigned Opcode; 7518 if (EltTy == MVT::i8) 7519 Opcode = AArch64ISD::DUPLANE8; 7520 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) 7521 Opcode = AArch64ISD::DUPLANE16; 7522 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 7523 Opcode = AArch64ISD::DUPLANE32; 7524 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 7525 Opcode = AArch64ISD::DUPLANE64; 7526 else 7527 llvm_unreachable("Invalid vector element type?"); 7528 7529 if (VT.getSizeInBits() == 64) 7530 OpLHS = WidenVector(OpLHS, DAG); 7531 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 7532 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 7533 } 7534 case OP_VEXT1: 7535 case OP_VEXT2: 7536 case OP_VEXT3: { 7537 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 7538 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 7539 DAG.getConstant(Imm, dl, MVT::i32)); 7540 } 7541 case OP_VUZPL: 7542 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 7543 OpRHS); 7544 case OP_VUZPR: 7545 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 7546 OpRHS); 7547 case OP_VZIPL: 7548 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 7549 OpRHS); 7550 case OP_VZIPR: 7551 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 7552 OpRHS); 7553 case OP_VTRNL: 7554 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 7555 OpRHS); 7556 case OP_VTRNR: 7557 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 7558 OpRHS); 7559 } 7560 } 
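// A brief note on the PFEntry encoding consumed above (as implied by the
// field extraction in GeneratePerfectShuffle and the OP_COPY check): bits
// [31:30] hold the cost that callers compare against 4, bits [29:26] the
// operation, bits [25:13] the LHS ID and bits [12:0] the RHS ID. Each ID
// packs a four-lane mask as a base-9 number with 8 meaning "undef"; e.g.
// the identity mask <0,1,2,3> is ((0*9+1)*9+2)*9+3 == 102 and <4,5,6,7> is
// ((4*9+5)*9+6)*9+7 == 3382, exactly the two IDs accepted by OP_COPY.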
7561 7562 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 7563 SelectionDAG &DAG) { 7564 // Check to see if we can use the TBL instruction. 7565 SDValue V1 = Op.getOperand(0); 7566 SDValue V2 = Op.getOperand(1); 7567 SDLoc DL(Op); 7568 7569 EVT EltVT = Op.getValueType().getVectorElementType(); 7570 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 7571 7572 SmallVector<SDValue, 8> TBLMask; 7573 for (int Val : ShuffleMask) { 7574 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 7575 unsigned Offset = Byte + Val * BytesPerElt; 7576 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 7577 } 7578 } 7579 7580 MVT IndexVT = MVT::v8i8; 7581 unsigned IndexLen = 8; 7582 if (Op.getValueSizeInBits() == 128) { 7583 IndexVT = MVT::v16i8; 7584 IndexLen = 16; 7585 } 7586 7587 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 7588 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 7589 7590 SDValue Shuffle; 7591 if (V2.getNode()->isUndef()) { 7592 if (IndexLen == 8) 7593 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 7594 Shuffle = DAG.getNode( 7595 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 7596 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 7597 DAG.getBuildVector(IndexVT, DL, 7598 makeArrayRef(TBLMask.data(), IndexLen))); 7599 } else { 7600 if (IndexLen == 8) { 7601 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 7602 Shuffle = DAG.getNode( 7603 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 7604 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 7605 DAG.getBuildVector(IndexVT, DL, 7606 makeArrayRef(TBLMask.data(), IndexLen))); 7607 } else { 7608 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 7609 // cannot currently represent the register constraints on the input 7610 // table registers. 7611 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 7612 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 7613 // IndexLen)); 7614 Shuffle = DAG.getNode( 7615 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 7616 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 7617 V2Cst, DAG.getBuildVector(IndexVT, DL, 7618 makeArrayRef(TBLMask.data(), IndexLen))); 7619 } 7620 } 7621 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 7622 } 7623 7624 static unsigned getDUPLANEOp(EVT EltType) { 7625 if (EltType == MVT::i8) 7626 return AArch64ISD::DUPLANE8; 7627 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) 7628 return AArch64ISD::DUPLANE16; 7629 if (EltType == MVT::i32 || EltType == MVT::f32) 7630 return AArch64ISD::DUPLANE32; 7631 if (EltType == MVT::i64 || EltType == MVT::f64) 7632 return AArch64ISD::DUPLANE64; 7633 7634 llvm_unreachable("Invalid vector element type?"); 7635 } 7636 7637 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 7638 SelectionDAG &DAG) const { 7639 SDLoc dl(Op); 7640 EVT VT = Op.getValueType(); 7641 7642 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7643 7644 // Convert shuffles that are directly supported on NEON to target-specific 7645 // DAG nodes, instead of keeping them as shuffles and matching them again 7646 // during code selection. This is more efficient and avoids the possibility 7647 // of inconsistencies between legalization and selection. 
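// Roughly, the checks below try the cheap single-instruction patterns first:
// splats (DUP/DUPLANE), REV, EXT, the ZIP/UZP/TRN family (including the
// single-source forms), a concat of the two sources, a single INS, a
// perfect-shuffle expansion for 4-element vectors, and finally a TBL lookup
// as the general fallback.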
7648 ArrayRef<int> ShuffleMask = SVN->getMask(); 7649 7650 SDValue V1 = Op.getOperand(0); 7651 SDValue V2 = Op.getOperand(1); 7652 7653 if (SVN->isSplat()) { 7654 int Lane = SVN->getSplatIndex(); 7655 // If this is undef splat, generate it via "just" vdup, if possible. 7656 if (Lane == -1) 7657 Lane = 0; 7658 7659 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) 7660 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), 7661 V1.getOperand(0)); 7662 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- 7663 // constant. If so, we can just reference the lane's definition directly. 7664 if (V1.getOpcode() == ISD::BUILD_VECTOR && 7665 !isa<ConstantSDNode>(V1.getOperand(Lane))) 7666 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); 7667 7668 // Otherwise, duplicate from the lane of the input vector. 7669 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); 7670 7671 // Try to eliminate a bitcasted extract subvector before a DUPLANE. 7672 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { 7673 // Match: dup (bitcast (extract_subv X, C)), LaneC 7674 if (BitCast.getOpcode() != ISD::BITCAST || 7675 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) 7676 return false; 7677 7678 // The extract index must align in the destination type. That may not 7679 // happen if the bitcast is from narrow to wide type. 7680 SDValue Extract = BitCast.getOperand(0); 7681 unsigned ExtIdx = Extract.getConstantOperandVal(1); 7682 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); 7683 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; 7684 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); 7685 if (ExtIdxInBits % CastedEltBitWidth != 0) 7686 return false; 7687 7688 // Update the lane value by offsetting with the scaled extract index. 7689 LaneC += ExtIdxInBits / CastedEltBitWidth; 7690 7691 // Determine the casted vector type of the wide vector input. 7692 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' 7693 // Examples: 7694 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 7695 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 7696 unsigned SrcVecNumElts = 7697 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; 7698 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), 7699 SrcVecNumElts); 7700 return true; 7701 }; 7702 MVT CastVT; 7703 if (getScaledOffsetDup(V1, Lane, CastVT)) { 7704 V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0)); 7705 } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { 7706 // The lane is incremented by the index of the extract. 7707 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 7708 Lane += V1.getConstantOperandVal(1); 7709 V1 = V1.getOperand(0); 7710 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { 7711 // The lane is decremented if we are splatting from the 2nd operand. 7712 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 7713 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; 7714 Lane -= Idx * VT.getVectorNumElements() / 2; 7715 V1 = WidenVector(V1.getOperand(Idx), DAG); 7716 } else if (VT.getSizeInBits() == 64) { 7717 // Widen the operand to 128-bit register with undef. 
7718 V1 = WidenVector(V1, DAG); 7719 } 7720 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); 7721 } 7722 7723 if (isREVMask(ShuffleMask, VT, 64)) 7724 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); 7725 if (isREVMask(ShuffleMask, VT, 32)) 7726 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); 7727 if (isREVMask(ShuffleMask, VT, 16)) 7728 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); 7729 7730 bool ReverseEXT = false; 7731 unsigned Imm; 7732 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { 7733 if (ReverseEXT) 7734 std::swap(V1, V2); 7735 Imm *= getExtFactor(V1); 7736 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, 7737 DAG.getConstant(Imm, dl, MVT::i32)); 7738 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { 7739 Imm *= getExtFactor(V1); 7740 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, 7741 DAG.getConstant(Imm, dl, MVT::i32)); 7742 } 7743 7744 unsigned WhichResult; 7745 if (isZIPMask(ShuffleMask, VT, WhichResult)) { 7746 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 7747 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 7748 } 7749 if (isUZPMask(ShuffleMask, VT, WhichResult)) { 7750 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 7751 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 7752 } 7753 if (isTRNMask(ShuffleMask, VT, WhichResult)) { 7754 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 7755 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 7756 } 7757 7758 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 7759 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 7760 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 7761 } 7762 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 7763 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 7764 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 7765 } 7766 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 7767 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 7768 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 7769 } 7770 7771 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 7772 return Concat; 7773 7774 bool DstIsLeft; 7775 int Anomaly; 7776 int NumInputElements = V1.getValueType().getVectorNumElements(); 7777 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 7778 SDValue DstVec = DstIsLeft ? V1 : V2; 7779 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 7780 7781 SDValue SrcVec = V1; 7782 int SrcLane = ShuffleMask[Anomaly]; 7783 if (SrcLane >= NumInputElements) { 7784 SrcVec = V2; 7785 SrcLane -= VT.getVectorNumElements(); 7786 } 7787 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 7788 7789 EVT ScalarVT = VT.getVectorElementType(); 7790 7791 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 7792 ScalarVT = MVT::i32; 7793 7794 return DAG.getNode( 7795 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 7796 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 7797 DstLaneV); 7798 } 7799 7800 // If the shuffle is not directly supported and it has 4 elements, use 7801 // the PerfectShuffle-generated table to synthesize it from other shuffles. 
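// The table index is the four mask elements written as a base-9 number, with
// 8 standing for an undef lane. For example the mask <0, 0, 1, 1> yields
// PFTableIndex = 0*729 + 0*81 + 1*9 + 1 = 10.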
7802 unsigned NumElts = VT.getVectorNumElements();
7803 if (NumElts == 4) {
7804 unsigned PFIndexes[4];
7805 for (unsigned i = 0; i != 4; ++i) {
7806 if (ShuffleMask[i] < 0)
7807 PFIndexes[i] = 8;
7808 else
7809 PFIndexes[i] = ShuffleMask[i];
7810 }
7811
7812 // Compute the index in the perfect shuffle table.
7813 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
7814 PFIndexes[2] * 9 + PFIndexes[3];
7815 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7816 unsigned Cost = (PFEntry >> 30);
7817
7818 if (Cost <= 4)
7819 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7820 }
7821
7822 return GenerateTBL(Op, ShuffleMask, DAG);
7823 }
7824
7825 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
7826 SelectionDAG &DAG) const {
7827 SDLoc dl(Op);
7828 EVT VT = Op.getValueType();
7829 EVT ElemVT = VT.getScalarType();
7830
7831 SDValue SplatVal = Op.getOperand(0);
7832
7833 // Extend input splat value where needed to fit into a GPR (32b or 64b only).
7834 // FPRs don't have this restriction.
7835 switch (ElemVT.getSimpleVT().SimpleTy) {
7836 case MVT::i1: {
7837 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
7838 // lowering code.
7839 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
7840 if (ConstVal->isOne())
7841 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
7842 // TODO: Add special case for constant false
7843 }
7844 // The general case of i1. There isn't any natural way to do this,
7845 // so we use some trickery with whilelo.
7846 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
7847 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
7848 DAG.getValueType(MVT::i1));
7849 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
7850 MVT::i64);
7851 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
7852 DAG.getConstant(0, dl, MVT::i64), SplatVal);
7853 }
7854 case MVT::i8:
7855 case MVT::i16:
7856 case MVT::i32:
7857 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
7858 break;
7859 case MVT::i64:
7860 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
7861 break;
7862 case MVT::f16:
7863 case MVT::bf16:
7864 case MVT::f32:
7865 case MVT::f64:
7866 // Fine as is
7867 break;
7868 default:
7869 report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
7870 }
7871
7872 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
7873 }
7874
7875 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
7876 SelectionDAG &DAG) const {
7877 SDLoc DL(Op);
7878
7879 EVT VT = Op.getValueType();
7880 if (!isTypeLegal(VT) || !VT.isScalableVector())
7881 return SDValue();
7882
7883 // Current lowering only supports the SVE-ACLE types.
7884 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
7885 return SDValue();
7886
7887 // The DUPQ operation is independent of element type so normalise to i64s.
7888 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
7889 SDValue Idx128 = Op.getOperand(2);
7890
7891 // DUPQ can be used when idx is in range.
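// This lowering only uses the immediate form (DUP_ZZI_Q) for indices 0..3;
// for a non-constant or larger index it falls back to the TBL sequence
// below, which repeatedly selects the i64 pair <2*idx, 2*idx+1>. For
// example an index of 5 produces the TBL mask <10, 11, 10, 11, ...>.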
7892 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); 7893 if (CIdx && (CIdx->getZExtValue() <= 3)) { 7894 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); 7895 SDNode *DUPQ = 7896 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); 7897 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); 7898 } 7899 7900 // The ACLE says this must produce the same result as: 7901 // svtbl(data, svadd_x(svptrue_b64(), 7902 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), 7903 // index * 2)) 7904 SDValue One = DAG.getConstant(1, DL, MVT::i64); 7905 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); 7906 7907 // create the vector 0,1,0,1,... 7908 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 7909 SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR, 7910 DL, MVT::nxv2i64, Zero, One); 7911 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); 7912 7913 // create the vector idx64,idx64+1,idx64,idx64+1,... 7914 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); 7915 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); 7916 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); 7917 7918 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... 7919 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); 7920 return DAG.getNode(ISD::BITCAST, DL, VT, TBL); 7921 } 7922 7923 7924 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 7925 APInt &UndefBits) { 7926 EVT VT = BVN->getValueType(0); 7927 APInt SplatBits, SplatUndef; 7928 unsigned SplatBitSize; 7929 bool HasAnyUndefs; 7930 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7931 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 7932 7933 for (unsigned i = 0; i < NumSplats; ++i) { 7934 CnstBits <<= SplatBitSize; 7935 UndefBits <<= SplatBitSize; 7936 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 7937 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 7938 } 7939 7940 return true; 7941 } 7942 7943 return false; 7944 } 7945 7946 // Try 64-bit splatted SIMD immediate. 7947 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 7948 const APInt &Bits) { 7949 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 7950 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 7951 EVT VT = Op.getValueType(); 7952 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; 7953 7954 if (AArch64_AM::isAdvSIMDModImmType10(Value)) { 7955 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); 7956 7957 SDLoc dl(Op); 7958 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 7959 DAG.getConstant(Value, dl, MVT::i32)); 7960 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 7961 } 7962 } 7963 7964 return SDValue(); 7965 } 7966 7967 // Try 32-bit splatted SIMD immediate. 7968 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 7969 const APInt &Bits, 7970 const SDValue *LHS = nullptr) { 7971 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 7972 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 7973 EVT VT = Op.getValueType(); 7974 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 7975 bool isAdvSIMDModImm = false; 7976 uint64_t Shift; 7977 7978 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { 7979 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); 7980 Shift = 0; 7981 } 7982 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { 7983 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); 7984 Shift = 8; 7985 } 7986 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { 7987 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); 7988 Shift = 16; 7989 } 7990 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { 7991 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); 7992 Shift = 24; 7993 } 7994 7995 if (isAdvSIMDModImm) { 7996 SDLoc dl(Op); 7997 SDValue Mov; 7998 7999 if (LHS) 8000 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 8001 DAG.getConstant(Value, dl, MVT::i32), 8002 DAG.getConstant(Shift, dl, MVT::i32)); 8003 else 8004 Mov = DAG.getNode(NewOp, dl, MovTy, 8005 DAG.getConstant(Value, dl, MVT::i32), 8006 DAG.getConstant(Shift, dl, MVT::i32)); 8007 8008 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 8009 } 8010 } 8011 8012 return SDValue(); 8013 } 8014 8015 // Try 16-bit splatted SIMD immediate. 8016 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 8017 const APInt &Bits, 8018 const SDValue *LHS = nullptr) { 8019 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 8020 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 8021 EVT VT = Op.getValueType(); 8022 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 8023 bool isAdvSIMDModImm = false; 8024 uint64_t Shift; 8025 8026 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { 8027 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); 8028 Shift = 0; 8029 } 8030 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { 8031 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); 8032 Shift = 8; 8033 } 8034 8035 if (isAdvSIMDModImm) { 8036 SDLoc dl(Op); 8037 SDValue Mov; 8038 8039 if (LHS) 8040 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 8041 DAG.getConstant(Value, dl, MVT::i32), 8042 DAG.getConstant(Shift, dl, MVT::i32)); 8043 else 8044 Mov = DAG.getNode(NewOp, dl, MovTy, 8045 DAG.getConstant(Value, dl, MVT::i32), 8046 DAG.getConstant(Shift, dl, MVT::i32)); 8047 8048 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 8049 } 8050 } 8051 8052 return SDValue(); 8053 } 8054 8055 // Try 32-bit splatted SIMD immediate with shifted ones. 8056 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, 8057 SelectionDAG &DAG, const APInt &Bits) { 8058 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 8059 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 8060 EVT VT = Op.getValueType(); 8061 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 8062 bool isAdvSIMDModImm = false; 8063 uint64_t Shift; 8064 8065 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { 8066 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); 8067 Shift = 264; 8068 } 8069 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { 8070 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); 8071 Shift = 272; 8072 } 8073 8074 if (isAdvSIMDModImm) { 8075 SDLoc dl(Op); 8076 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 8077 DAG.getConstant(Value, dl, MVT::i32), 8078 DAG.getConstant(Shift, dl, MVT::i32)); 8079 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 8080 } 8081 } 8082 8083 return SDValue(); 8084 } 8085 8086 // Try 8-bit splatted SIMD immediate. 8087 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 8088 const APInt &Bits) { 8089 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 8090 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 8091 EVT VT = Op.getValueType(); 8092 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 8093 8094 if (AArch64_AM::isAdvSIMDModImmType9(Value)) { 8095 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); 8096 8097 SDLoc dl(Op); 8098 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 8099 DAG.getConstant(Value, dl, MVT::i32)); 8100 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 8101 } 8102 } 8103 8104 return SDValue(); 8105 } 8106 8107 // Try FP splatted SIMD immediate. 8108 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 8109 const APInt &Bits) { 8110 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 8111 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 8112 EVT VT = Op.getValueType(); 8113 bool isWide = (VT.getSizeInBits() == 128); 8114 MVT MovTy; 8115 bool isAdvSIMDModImm = false; 8116 8117 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { 8118 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); 8119 MovTy = isWide ? 
MVT::v4f32 : MVT::v2f32; 8120 } 8121 else if (isWide && 8122 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { 8123 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); 8124 MovTy = MVT::v2f64; 8125 } 8126 8127 if (isAdvSIMDModImm) { 8128 SDLoc dl(Op); 8129 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 8130 DAG.getConstant(Value, dl, MVT::i32)); 8131 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 8132 } 8133 } 8134 8135 return SDValue(); 8136 } 8137 8138 // Specialized code to quickly find if PotentialBVec is a BuildVector that 8139 // consists of only the same constant int value, returned in reference arg 8140 // ConstVal 8141 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 8142 uint64_t &ConstVal) { 8143 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 8144 if (!Bvec) 8145 return false; 8146 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 8147 if (!FirstElt) 8148 return false; 8149 EVT VT = Bvec->getValueType(0); 8150 unsigned NumElts = VT.getVectorNumElements(); 8151 for (unsigned i = 1; i < NumElts; ++i) 8152 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 8153 return false; 8154 ConstVal = FirstElt->getZExtValue(); 8155 return true; 8156 } 8157 8158 static unsigned getIntrinsicID(const SDNode *N) { 8159 unsigned Opcode = N->getOpcode(); 8160 switch (Opcode) { 8161 default: 8162 return Intrinsic::not_intrinsic; 8163 case ISD::INTRINSIC_WO_CHAIN: { 8164 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 8165 if (IID < Intrinsic::num_intrinsics) 8166 return IID; 8167 return Intrinsic::not_intrinsic; 8168 } 8169 } 8170 } 8171 8172 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 8173 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 8174 // BUILD_VECTORs with constant element C1, C2 is a constant, and: 8175 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) 8176 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) 8177 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. 8178 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 8179 EVT VT = N->getValueType(0); 8180 8181 if (!VT.isVector()) 8182 return SDValue(); 8183 8184 SDLoc DL(N); 8185 8186 SDValue And; 8187 SDValue Shift; 8188 8189 SDValue FirstOp = N->getOperand(0); 8190 unsigned FirstOpc = FirstOp.getOpcode(); 8191 SDValue SecondOp = N->getOperand(1); 8192 unsigned SecondOpc = SecondOp.getOpcode(); 8193 8194 // Is one of the operands an AND or a BICi? The AND may have been optimised to 8195 // a BICi in order to use an immediate instead of a register. 8196 // Is the other operand an shl or lshr? This will have been turned into: 8197 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. 8198 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && 8199 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { 8200 And = FirstOp; 8201 Shift = SecondOp; 8202 8203 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && 8204 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { 8205 And = SecondOp; 8206 Shift = FirstOp; 8207 } else 8208 return SDValue(); 8209 8210 bool IsAnd = And.getOpcode() == ISD::AND; 8211 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; 8212 8213 // Is the shift amount constant? 
8214 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 8215 if (!C2node) 8216 return SDValue(); 8217 8218 uint64_t C1; 8219 if (IsAnd) { 8220 // Is the and mask vector all constant? 8221 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 8222 return SDValue(); 8223 } else { 8224 // Reconstruct the corresponding AND immediate from the two BICi immediates. 8225 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); 8226 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); 8227 assert(C1nodeImm && C1nodeShift); 8228 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); 8229 } 8230 8231 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or 8232 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account 8233 // how much one can shift elements of a particular size? 8234 uint64_t C2 = C2node->getZExtValue(); 8235 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 8236 if (C2 > ElemSizeInBits) 8237 return SDValue(); 8238 8239 APInt C1AsAPInt(ElemSizeInBits, C1); 8240 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) 8241 : APInt::getLowBitsSet(ElemSizeInBits, C2); 8242 if (C1AsAPInt != RequiredC1) 8243 return SDValue(); 8244 8245 SDValue X = And.getOperand(0); 8246 SDValue Y = Shift.getOperand(0); 8247 8248 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; 8249 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); 8250 8251 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 8252 LLVM_DEBUG(N->dump(&DAG)); 8253 LLVM_DEBUG(dbgs() << "into: \n"); 8254 LLVM_DEBUG(ResultSLI->dump(&DAG)); 8255 8256 ++NumShiftInserts; 8257 return ResultSLI; 8258 } 8259 8260 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 8261 SelectionDAG &DAG) const { 8262 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 8263 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 8264 return Res; 8265 8266 EVT VT = Op.getValueType(); 8267 8268 SDValue LHS = Op.getOperand(0); 8269 BuildVectorSDNode *BVN = 8270 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 8271 if (!BVN) { 8272 // OR commutes, so try swapping the operands. 8273 LHS = Op.getOperand(1); 8274 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 8275 } 8276 if (!BVN) 8277 return Op; 8278 8279 APInt DefBits(VT.getSizeInBits(), 0); 8280 APInt UndefBits(VT.getSizeInBits(), 0); 8281 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 8282 SDValue NewOp; 8283 8284 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 8285 DefBits, &LHS)) || 8286 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 8287 DefBits, &LHS))) 8288 return NewOp; 8289 8290 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 8291 UndefBits, &LHS)) || 8292 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 8293 UndefBits, &LHS))) 8294 return NewOp; 8295 } 8296 8297 // We can always fall back to a non-immediate OR. 8298 return Op; 8299 } 8300 8301 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 8302 // be truncated to fit element width. 
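// For example, in a v8i8 BUILD_VECTOR a constant lane holding 0x1ff is
// rebuilt as the i32 constant 0xff; non-constant lanes are expected to
// already be i32 after type legalization (see the assert below).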
8303 static SDValue NormalizeBuildVector(SDValue Op, 8304 SelectionDAG &DAG) { 8305 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 8306 SDLoc dl(Op); 8307 EVT VT = Op.getValueType(); 8308 EVT EltTy= VT.getVectorElementType(); 8309 8310 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 8311 return Op; 8312 8313 SmallVector<SDValue, 16> Ops; 8314 for (SDValue Lane : Op->ops()) { 8315 // For integer vectors, type legalization would have promoted the 8316 // operands already. Otherwise, if Op is a floating-point splat 8317 // (with operands cast to integers), then the only possibilities 8318 // are constants and UNDEFs. 8319 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 8320 APInt LowBits(EltTy.getSizeInBits(), 8321 CstLane->getZExtValue()); 8322 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 8323 } else if (Lane.getNode()->isUndef()) { 8324 Lane = DAG.getUNDEF(MVT::i32); 8325 } else { 8326 assert(Lane.getValueType() == MVT::i32 && 8327 "Unexpected BUILD_VECTOR operand type"); 8328 } 8329 Ops.push_back(Lane); 8330 } 8331 return DAG.getBuildVector(VT, dl, Ops); 8332 } 8333 8334 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { 8335 EVT VT = Op.getValueType(); 8336 8337 APInt DefBits(VT.getSizeInBits(), 0); 8338 APInt UndefBits(VT.getSizeInBits(), 0); 8339 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 8340 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 8341 SDValue NewOp; 8342 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 8343 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 8344 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 8345 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 8346 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 8347 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 8348 return NewOp; 8349 8350 DefBits = ~DefBits; 8351 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 8352 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 8353 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 8354 return NewOp; 8355 8356 DefBits = UndefBits; 8357 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 8358 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 8359 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 8360 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 8361 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 8362 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 8363 return NewOp; 8364 8365 DefBits = ~UndefBits; 8366 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 8367 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 8368 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 8369 return NewOp; 8370 } 8371 8372 return SDValue(); 8373 } 8374 8375 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 8376 SelectionDAG &DAG) const { 8377 EVT VT = Op.getValueType(); 8378 8379 // Try to build a simple constant vector. 8380 Op = NormalizeBuildVector(Op, DAG); 8381 if (VT.isInteger()) { 8382 // Certain vector constants, used to express things like logical NOT and 8383 // arithmetic NEG, are passed through unmodified. 
This allows special 8384 // patterns for these operations to match, which will lower these constants 8385 // to whatever is proven necessary. 8386 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 8387 if (BVN->isConstant()) 8388 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { 8389 unsigned BitSize = VT.getVectorElementType().getSizeInBits(); 8390 APInt Val(BitSize, 8391 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); 8392 if (Val.isNullValue() || Val.isAllOnesValue()) 8393 return Op; 8394 } 8395 } 8396 8397 if (SDValue V = ConstantBuildVector(Op, DAG)) 8398 return V; 8399 8400 // Scan through the operands to find some interesting properties we can 8401 // exploit: 8402 // 1) If only one value is used, we can use a DUP, or 8403 // 2) if only the low element is not undef, we can just insert that, or 8404 // 3) if only one constant value is used (w/ some non-constant lanes), 8405 // we can splat the constant value into the whole vector then fill 8406 // in the non-constant lanes. 8407 // 4) FIXME: If different constant values are used, but we can intelligently 8408 // select the values we'll be overwriting for the non-constant 8409 // lanes such that we can directly materialize the vector 8410 // some other way (MOVI, e.g.), we can be sneaky. 8411 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. 8412 SDLoc dl(Op); 8413 unsigned NumElts = VT.getVectorNumElements(); 8414 bool isOnlyLowElement = true; 8415 bool usesOnlyOneValue = true; 8416 bool usesOnlyOneConstantValue = true; 8417 bool isConstant = true; 8418 bool AllLanesExtractElt = true; 8419 unsigned NumConstantLanes = 0; 8420 SDValue Value; 8421 SDValue ConstantValue; 8422 for (unsigned i = 0; i < NumElts; ++i) { 8423 SDValue V = Op.getOperand(i); 8424 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 8425 AllLanesExtractElt = false; 8426 if (V.isUndef()) 8427 continue; 8428 if (i > 0) 8429 isOnlyLowElement = false; 8430 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 8431 isConstant = false; 8432 8433 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { 8434 ++NumConstantLanes; 8435 if (!ConstantValue.getNode()) 8436 ConstantValue = V; 8437 else if (ConstantValue != V) 8438 usesOnlyOneConstantValue = false; 8439 } 8440 8441 if (!Value.getNode()) 8442 Value = V; 8443 else if (V != Value) 8444 usesOnlyOneValue = false; 8445 } 8446 8447 if (!Value.getNode()) { 8448 LLVM_DEBUG( 8449 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); 8450 return DAG.getUNDEF(VT); 8451 } 8452 8453 // Convert BUILD_VECTOR where all elements but the lowest are undef into 8454 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector 8455 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. 8456 if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) { 8457 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " 8458 "SCALAR_TO_VECTOR node\n"); 8459 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 8460 } 8461 8462 if (AllLanesExtractElt) { 8463 SDNode *Vector = nullptr; 8464 bool Even = false; 8465 bool Odd = false; 8466 // Check whether the extract elements match the Even pattern <0,2,4,...> or 8467 // the Odd pattern <1,3,5,...>. 
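// For example, a v4i16 BUILD_VECTOR of <X[0], X[2], X[4], X[6]> extracted
// from a single v8i16 source X matches the Even pattern and becomes
// UZP1(lo(X), hi(X)); <X[1], X[3], X[5], X[7]> matches the Odd pattern and
// becomes UZP2(lo(X), hi(X)).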
8468 for (unsigned i = 0; i < NumElts; ++i) { 8469 SDValue V = Op.getOperand(i); 8470 const SDNode *N = V.getNode(); 8471 if (!isa<ConstantSDNode>(N->getOperand(1))) 8472 break; 8473 SDValue N0 = N->getOperand(0); 8474 8475 // All elements are extracted from the same vector. 8476 if (!Vector) { 8477 Vector = N0.getNode(); 8478 // Check that the type of EXTRACT_VECTOR_ELT matches the type of 8479 // BUILD_VECTOR. 8480 if (VT.getVectorElementType() != 8481 N0.getValueType().getVectorElementType()) 8482 break; 8483 } else if (Vector != N0.getNode()) { 8484 Odd = false; 8485 Even = false; 8486 break; 8487 } 8488 8489 // Extracted values are either at Even indices <0,2,4,...> or at Odd 8490 // indices <1,3,5,...>. 8491 uint64_t Val = N->getConstantOperandVal(1); 8492 if (Val == 2 * i) { 8493 Even = true; 8494 continue; 8495 } 8496 if (Val - 1 == 2 * i) { 8497 Odd = true; 8498 continue; 8499 } 8500 8501 // Something does not match: abort. 8502 Odd = false; 8503 Even = false; 8504 break; 8505 } 8506 if (Even || Odd) { 8507 SDValue LHS = 8508 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 8509 DAG.getConstant(0, dl, MVT::i64)); 8510 SDValue RHS = 8511 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 8512 DAG.getConstant(NumElts, dl, MVT::i64)); 8513 8514 if (Even && !Odd) 8515 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, 8516 RHS); 8517 if (Odd && !Even) 8518 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, 8519 RHS); 8520 } 8521 } 8522 8523 // Use DUP for non-constant splats. For f32 constant splats, reduce to 8524 // i32 and try again. 8525 if (usesOnlyOneValue) { 8526 if (!isConstant) { 8527 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 8528 Value.getValueType() != VT) { 8529 LLVM_DEBUG( 8530 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); 8531 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 8532 } 8533 8534 // This is actually a DUPLANExx operation, which keeps everything vectory. 8535 8536 SDValue Lane = Value.getOperand(1); 8537 Value = Value.getOperand(0); 8538 if (Value.getValueSizeInBits() == 64) { 8539 LLVM_DEBUG( 8540 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " 8541 "widening it\n"); 8542 Value = WidenVector(Value, DAG); 8543 } 8544 8545 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 8546 return DAG.getNode(Opcode, dl, VT, Value, Lane); 8547 } 8548 8549 if (VT.getVectorElementType().isFloatingPoint()) { 8550 SmallVector<SDValue, 8> Ops; 8551 EVT EltTy = VT.getVectorElementType(); 8552 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || 8553 EltTy == MVT::f64) && "Unsupported floating-point vector type"); 8554 LLVM_DEBUG( 8555 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " 8556 "BITCASTS, and try again\n"); 8557 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 8558 for (unsigned i = 0; i < NumElts; ++i) 8559 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 8560 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 8561 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 8562 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; 8563 Val.dump();); 8564 Val = LowerBUILD_VECTOR(Val, DAG); 8565 if (Val.getNode()) 8566 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8567 } 8568 } 8569 8570 // If there was only one constant value used and for more than one lane, 8571 // start by splatting that value, then replace the non-constant lanes. 
This 8572 // is better than the default, which will perform a separate initialization 8573 // for each lane. 8574 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { 8575 // Firstly, try to materialize the splat constant. 8576 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), 8577 Val = ConstantBuildVector(Vec, DAG); 8578 if (!Val) { 8579 // Otherwise, materialize the constant and splat it. 8580 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 8581 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); 8582 } 8583 8584 // Now insert the non-constant lanes. 8585 for (unsigned i = 0; i < NumElts; ++i) { 8586 SDValue V = Op.getOperand(i); 8587 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 8588 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) 8589 // Note that type legalization likely mucked about with the VT of the 8590 // source operand, so we may have to convert it here before inserting. 8591 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 8592 } 8593 return Val; 8594 } 8595 8596 // This will generate a load from the constant pool. 8597 if (isConstant) { 8598 LLVM_DEBUG( 8599 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " 8600 "expansion\n"); 8601 return SDValue(); 8602 } 8603 8604 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 8605 if (NumElts >= 4) { 8606 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 8607 return shuffle; 8608 } 8609 8610 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 8611 // know the default expansion would otherwise fall back on something even 8612 // worse. For a vector with one or two non-undef values, that's 8613 // scalar_to_vector for the elements followed by a shuffle (provided the 8614 // shuffle is valid for the target) and materialization element by element 8615 // on the stack followed by a load for everything else. 8616 if (!isConstant && !usesOnlyOneValue) { 8617 LLVM_DEBUG( 8618 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " 8619 "of INSERT_VECTOR_ELT\n"); 8620 8621 SDValue Vec = DAG.getUNDEF(VT); 8622 SDValue Op0 = Op.getOperand(0); 8623 unsigned i = 0; 8624 8625 // Use SCALAR_TO_VECTOR for lane zero to 8626 // a) Avoid a RMW dependency on the full vector register, and 8627 // b) Allow the register coalescer to fold away the copy if the 8628 // value is already in an S or D register, and we're forced to emit an 8629 // INSERT_SUBREG that we can't fold anywhere. 8630 // 8631 // We also allow types like i8 and i16 which are illegal scalar but legal 8632 // vector element types. After type-legalization the inserted value is 8633 // extended (i32) and it is safe to cast them to the vector type by ignoring 8634 // the upper bits of the lowest lane (e.g. v8i8, v4i16). 
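// For example, when building a v4i16 from i16 values that type legalization
// promoted to i32, lane 0 is placed with a single SCALAR_TO_VECTOR and every
// remaining defined lane is inserted with INSERT_VECTOR_ELT below.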
8635 if (!Op0.isUndef()) { 8636 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); 8637 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); 8638 ++i; 8639 } 8640 LLVM_DEBUG(if (i < NumElts) dbgs() 8641 << "Creating nodes for the other vector elements:\n";); 8642 for (; i < NumElts; ++i) { 8643 SDValue V = Op.getOperand(i); 8644 if (V.isUndef()) 8645 continue; 8646 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 8647 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 8648 } 8649 return Vec; 8650 } 8651 8652 LLVM_DEBUG( 8653 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " 8654 "better alternative\n"); 8655 return SDValue(); 8656 } 8657 8658 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8659 SelectionDAG &DAG) const { 8660 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 8661 8662 // Check for non-constant or out of range lane. 8663 EVT VT = Op.getOperand(0).getValueType(); 8664 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 8665 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 8666 return SDValue(); 8667 8668 8669 // Insertion/extraction are legal for V128 types. 8670 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 8671 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 8672 VT == MVT::v8f16 || VT == MVT::v8bf16) 8673 return Op; 8674 8675 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 8676 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 8677 VT != MVT::v4bf16) 8678 return SDValue(); 8679 8680 // For V64 types, we perform insertion by expanding the value 8681 // to a V128 type and perform the insertion on that. 8682 SDLoc DL(Op); 8683 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 8684 EVT WideTy = WideVec.getValueType(); 8685 8686 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 8687 Op.getOperand(1), Op.getOperand(2)); 8688 // Re-narrow the resultant vector. 8689 return NarrowVector(Node, DAG); 8690 } 8691 8692 SDValue 8693 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 8694 SelectionDAG &DAG) const { 8695 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 8696 8697 // Check for non-constant or out of range lane. 8698 EVT VT = Op.getOperand(0).getValueType(); 8699 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8700 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 8701 return SDValue(); 8702 8703 8704 // Insertion/extraction are legal for V128 types. 8705 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 8706 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 8707 VT == MVT::v8f16 || VT == MVT::v8bf16) 8708 return Op; 8709 8710 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 8711 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && 8712 VT != MVT::v4bf16) 8713 return SDValue(); 8714 8715 // For V64 types, we perform extraction by expanding the value 8716 // to a V128 type and perform the extraction on that. 8717 SDLoc DL(Op); 8718 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 8719 EVT WideTy = WideVec.getValueType(); 8720 8721 EVT ExtrTy = WideTy.getVectorElementType(); 8722 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 8723 ExtrTy = MVT::i32; 8724 8725 // For extractions, we just return the result directly. 
8726 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 8727 Op.getOperand(1)); 8728 } 8729 8730 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 8731 SelectionDAG &DAG) const { 8732 assert(Op.getValueType().isFixedLengthVector() && 8733 "Only cases that extract a fixed length vector are supported!"); 8734 8735 EVT InVT = Op.getOperand(0).getValueType(); 8736 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8737 unsigned Size = Op.getValueSizeInBits(); 8738 8739 if (InVT.isScalableVector()) { 8740 // This will be matched by custom code during ISelDAGToDAG. 8741 if (Idx == 0 && isPackedVectorType(InVT, DAG)) 8742 return Op; 8743 8744 return SDValue(); 8745 } 8746 8747 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 8748 if (Idx == 0 && InVT.getSizeInBits() <= 128) 8749 return Op; 8750 8751 // If this is extracting the upper 64-bits of a 128-bit vector, we match 8752 // that directly. 8753 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64) 8754 return Op; 8755 8756 return SDValue(); 8757 } 8758 8759 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, 8760 SelectionDAG &DAG) const { 8761 assert(Op.getValueType().isScalableVector() && 8762 "Only expect to lower inserts into scalable vectors!"); 8763 8764 EVT InVT = Op.getOperand(1).getValueType(); 8765 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8766 8767 // We don't have any patterns for scalable vector yet. 8768 if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT)) 8769 return SDValue(); 8770 8771 // This will be matched by custom code during ISelDAGToDAG. 8772 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) 8773 return Op; 8774 8775 return SDValue(); 8776 } 8777 8778 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 8779 // Currently no fixed length shuffles that require SVE are legal. 8780 if (useSVEForFixedLengthVectorVT(VT)) 8781 return false; 8782 8783 if (VT.getVectorNumElements() == 4 && 8784 (VT.is128BitVector() || VT.is64BitVector())) { 8785 unsigned PFIndexes[4]; 8786 for (unsigned i = 0; i != 4; ++i) { 8787 if (M[i] < 0) 8788 PFIndexes[i] = 8; 8789 else 8790 PFIndexes[i] = M[i]; 8791 } 8792 8793 // Compute the index in the perfect shuffle table. 8794 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 8795 PFIndexes[2] * 9 + PFIndexes[3]; 8796 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8797 unsigned Cost = (PFEntry >> 30); 8798 8799 if (Cost <= 4) 8800 return true; 8801 } 8802 8803 bool DummyBool; 8804 int DummyInt; 8805 unsigned DummyUnsigned; 8806 8807 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 8808 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 8809 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 8810 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 8811 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 8812 isZIPMask(M, VT, DummyUnsigned) || 8813 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 8814 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 8815 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 8816 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 8817 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 8818 } 8819 8820 /// getVShiftImm - Check if this is a valid build_vector for the immediate 8821 /// operand of a vector shift operation, where all the elements of the 8822 /// build_vector must have the same constant integer value. 
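/// For example, for a v4i32 shift whose shift-amount operand is a constant
/// splat of 3 (possibly behind bitcasts), this sets Cnt to 3 and returns
/// true.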
8823 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
8824 // Ignore bit_converts.
8825 while (Op.getOpcode() == ISD::BITCAST)
8826 Op = Op.getOperand(0);
8827 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
8828 APInt SplatBits, SplatUndef;
8829 unsigned SplatBitSize;
8830 bool HasAnyUndefs;
8831 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
8832 HasAnyUndefs, ElementBits) ||
8833 SplatBitSize > ElementBits)
8834 return false;
8835 Cnt = SplatBits.getSExtValue();
8836 return true;
8837 }
8838
8839 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
8840 /// operand of a vector shift left operation. That value must be in the range:
8841 /// 0 <= Value < ElementBits for a left shift; or
8842 /// 0 <= Value <= ElementBits for a long left shift.
8843 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
8844 assert(VT.isVector() && "vector shift count is not a vector type");
8845 int64_t ElementBits = VT.getScalarSizeInBits();
8846 if (!getVShiftImm(Op, ElementBits, Cnt))
8847 return false;
8848 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
8849 }
8850
8851 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
8852 /// operand of a vector shift right operation. The value must be in the range:
8853 /// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
8854 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
8855 assert(VT.isVector() && "vector shift count is not a vector type");
8856 int64_t ElementBits = VT.getScalarSizeInBits();
8857 if (!getVShiftImm(Op, ElementBits, Cnt))
8858 return false;
8859 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
8860 }
8861
8862 // Attempt to form urhadd(OpA, OpB) from
8863 // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
8864 // The original form of this expression is
8865 // truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
8866 // is called the srl will have been lowered to AArch64ISD::VLSHR and the
8867 // ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
8868 // This function can also recognize a variant of this pattern that uses sign
8869 // extension instead of zero extension and forms a srhadd(OpA, OpB) from it.
8870 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
8871 SelectionDAG &DAG) const {
8872 EVT VT = Op.getValueType();
8873
8874 if (VT.getScalarType() == MVT::i1) {
8875 // Lower i1 truncate to `(x & 1) != 0`.
8876 SDLoc dl(Op);
8877 EVT OpVT = Op.getOperand(0).getValueType();
8878 SDValue Zero = DAG.getConstant(0, dl, OpVT);
8879 SDValue One = DAG.getConstant(1, dl, OpVT);
8880 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
8881 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
8882 }
8883
8884 if (!VT.isVector() || VT.isScalableVector())
8885 return Op;
8886
8887 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
8888 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
8889
8890 // Since we are looking for a right shift by a constant value of 1 and we are
8891 // operating on types at least 16 bits in length (sign/zero extended OpA and
8892 // OpB, which are at least 8 bits), it follows that the truncate will always
8893 // discard the shifted-in bit and therefore the right shift will be logical
8894 // regardless of the signedness of OpA and OpB.
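// Concretely, for v8i8 operands the DAG matched below looks like
//   truncate(vlshr(sub(zext OpB to v8i16, xor(zext OpA to v8i16, all-ones)), 1))
// and is rewritten to urhadd(OpA, OpB), or srhadd(OpA, OpB) for the
// sign-extended form.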
8895 SDValue Shift = Op.getOperand(0); 8896 if (Shift.getOpcode() != AArch64ISD::VLSHR) 8897 return Op; 8898 8899 // Is the right shift using an immediate value of 1? 8900 uint64_t ShiftAmount = Shift.getConstantOperandVal(1); 8901 if (ShiftAmount != 1) 8902 return Op; 8903 8904 SDValue Sub = Shift->getOperand(0); 8905 if (Sub.getOpcode() != ISD::SUB) 8906 return Op; 8907 8908 SDValue Xor = Sub.getOperand(1); 8909 if (Xor.getOpcode() != ISD::XOR) 8910 return Op; 8911 8912 SDValue ExtendOpA = Xor.getOperand(0); 8913 SDValue ExtendOpB = Sub.getOperand(0); 8914 unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); 8915 unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); 8916 if (!(ExtendOpAOpc == ExtendOpBOpc && 8917 (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) 8918 return Op; 8919 8920 // Is the result of the right shift being truncated to the same value type as 8921 // the original operands, OpA and OpB? 8922 SDValue OpA = ExtendOpA.getOperand(0); 8923 SDValue OpB = ExtendOpB.getOperand(0); 8924 EVT OpAVT = OpA.getValueType(); 8925 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); 8926 if (!(VT == OpAVT && OpAVT == OpB.getValueType())) 8927 return Op; 8928 8929 // Is the XOR using a constant amount of all ones in the right hand side? 8930 uint64_t C; 8931 if (!isAllConstantBuildVector(Xor.getOperand(1), C)) 8932 return Op; 8933 8934 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 8935 APInt CAsAPInt(ElemSizeInBits, C); 8936 if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) 8937 return Op; 8938 8939 SDLoc DL(Op); 8940 bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; 8941 unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD; 8942 SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB); 8943 8944 return ResultURHADD; 8945 } 8946 8947 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, 8948 SelectionDAG &DAG) const { 8949 EVT VT = Op.getValueType(); 8950 SDLoc DL(Op); 8951 int64_t Cnt; 8952 8953 if (!Op.getOperand(1).getValueType().isVector()) 8954 return Op; 8955 unsigned EltSize = VT.getScalarSizeInBits(); 8956 8957 switch (Op.getOpcode()) { 8958 default: 8959 llvm_unreachable("unexpected shift opcode"); 8960 8961 case ISD::SHL: 8962 if (VT.isScalableVector()) 8963 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1); 8964 8965 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) 8966 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), 8967 DAG.getConstant(Cnt, DL, MVT::i32)); 8968 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8969 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, 8970 MVT::i32), 8971 Op.getOperand(0), Op.getOperand(1)); 8972 case ISD::SRA: 8973 case ISD::SRL: 8974 if (VT.isScalableVector()) { 8975 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1 8976 : AArch64ISD::SRL_MERGE_OP1; 8977 return LowerToPredicatedOp(Op, DAG, Opc); 8978 } 8979 8980 // Right shift immediate 8981 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { 8982 unsigned Opc = 8983 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; 8984 return DAG.getNode(Opc, DL, VT, Op.getOperand(0), 8985 DAG.getConstant(Cnt, DL, MVT::i32)); 8986 } 8987 8988 // Right shift register. Note, there is not a shift right register 8989 // instruction, but the shift left register instruction takes a signed 8990 // value, where negative numbers specify a right shift. 8991 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::aarch64_neon_sshl
8992 : Intrinsic::aarch64_neon_ushl;
8993 // Negate the shift amount.
8994 SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
8995 SDValue NegShiftLeft =
8996 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8997 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
8998 NegShift);
8999 return NegShiftLeft;
9000 }
9001
9002 return SDValue();
9003 }
9004
9005 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
9006 AArch64CC::CondCode CC, bool NoNans, EVT VT,
9007 const SDLoc &dl, SelectionDAG &DAG) {
9008 EVT SrcVT = LHS.getValueType();
9009 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
9010 "function only supposed to emit natural comparisons");
9011
9012 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
9013 APInt CnstBits(VT.getSizeInBits(), 0);
9014 APInt UndefBits(VT.getSizeInBits(), 0);
9015 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
9016 bool IsZero = IsCnst && (CnstBits == 0);
9017
9018 if (SrcVT.getVectorElementType().isFloatingPoint()) {
9019 switch (CC) {
9020 default:
9021 return SDValue();
9022 case AArch64CC::NE: {
9023 SDValue Fcmeq;
9024 if (IsZero)
9025 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
9026 else
9027 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
9028 return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
9029 }
9030 case AArch64CC::EQ:
9031 if (IsZero)
9032 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
9033 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
9034 case AArch64CC::GE:
9035 if (IsZero)
9036 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
9037 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
9038 case AArch64CC::GT:
9039 if (IsZero)
9040 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
9041 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
9042 case AArch64CC::LS:
9043 if (IsZero)
9044 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
9045 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
9046 case AArch64CC::LT:
9047 if (!NoNans)
9048 return SDValue();
9049 // If we ignore NaNs then we can use the MI implementation.
9050 LLVM_FALLTHROUGH; 9051 case AArch64CC::MI: 9052 if (IsZero) 9053 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); 9054 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); 9055 } 9056 } 9057 9058 switch (CC) { 9059 default: 9060 return SDValue(); 9061 case AArch64CC::NE: { 9062 SDValue Cmeq; 9063 if (IsZero) 9064 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 9065 else 9066 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 9067 return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); 9068 } 9069 case AArch64CC::EQ: 9070 if (IsZero) 9071 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 9072 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 9073 case AArch64CC::GE: 9074 if (IsZero) 9075 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); 9076 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); 9077 case AArch64CC::GT: 9078 if (IsZero) 9079 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); 9080 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); 9081 case AArch64CC::LE: 9082 if (IsZero) 9083 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); 9084 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); 9085 case AArch64CC::LS: 9086 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); 9087 case AArch64CC::LO: 9088 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); 9089 case AArch64CC::LT: 9090 if (IsZero) 9091 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); 9092 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); 9093 case AArch64CC::HI: 9094 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); 9095 case AArch64CC::HS: 9096 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); 9097 } 9098 } 9099 9100 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, 9101 SelectionDAG &DAG) const { 9102 if (Op.getValueType().isScalableVector()) { 9103 if (Op.getOperand(0).getValueType().isFloatingPoint()) 9104 return Op; 9105 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); 9106 } 9107 9108 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 9109 SDValue LHS = Op.getOperand(0); 9110 SDValue RHS = Op.getOperand(1); 9111 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); 9112 SDLoc dl(Op); 9113 9114 if (LHS.getValueType().getVectorElementType().isInteger()) { 9115 assert(LHS.getValueType() == RHS.getValueType()); 9116 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 9117 SDValue Cmp = 9118 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); 9119 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 9120 } 9121 9122 const bool FullFP16 = 9123 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 9124 9125 // Make v4f16 (only) fcmp operations utilise vector instructions 9126 // v8f16 support will be a litle more complicated 9127 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { 9128 if (LHS.getValueType().getVectorNumElements() == 4) { 9129 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); 9130 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); 9131 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); 9132 DAG.ReplaceAllUsesWith(Op, NewSetcc); 9133 CmpVT = MVT::v4i32; 9134 } else 9135 return SDValue(); 9136 } 9137 9138 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || 9139 LHS.getValueType().getVectorElementType() != MVT::f128); 9140 9141 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 9142 // clean. Some of them require two branches to implement. 
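// For example (illustrative): an ordered not-equal (SETONE) has no single
// NEON compare, so it is emitted as two FCMGT comparisons (one in each
// operand order) that are OR'd together via the CC1/CC2 pair below.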
9143 AArch64CC::CondCode CC1, CC2; 9144 bool ShouldInvert; 9145 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 9146 9147 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; 9148 SDValue Cmp = 9149 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 9150 if (!Cmp.getNode()) 9151 return SDValue(); 9152 9153 if (CC2 != AArch64CC::AL) { 9154 SDValue Cmp2 = 9155 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 9156 if (!Cmp2.getNode()) 9157 return SDValue(); 9158 9159 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 9160 } 9161 9162 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 9163 9164 if (ShouldInvert) 9165 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 9166 9167 return Cmp; 9168 } 9169 9170 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, 9171 SelectionDAG &DAG) { 9172 SDValue VecOp = ScalarOp.getOperand(0); 9173 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); 9174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, 9175 DAG.getConstant(0, DL, MVT::i64)); 9176 } 9177 9178 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, 9179 SelectionDAG &DAG) const { 9180 SDLoc dl(Op); 9181 switch (Op.getOpcode()) { 9182 case ISD::VECREDUCE_ADD: 9183 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); 9184 case ISD::VECREDUCE_SMAX: 9185 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); 9186 case ISD::VECREDUCE_SMIN: 9187 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); 9188 case ISD::VECREDUCE_UMAX: 9189 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); 9190 case ISD::VECREDUCE_UMIN: 9191 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); 9192 case ISD::VECREDUCE_FMAX: { 9193 assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); 9194 return DAG.getNode( 9195 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 9196 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), 9197 Op.getOperand(0)); 9198 } 9199 case ISD::VECREDUCE_FMIN: { 9200 assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); 9201 return DAG.getNode( 9202 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 9203 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), 9204 Op.getOperand(0)); 9205 } 9206 default: 9207 llvm_unreachable("Unhandled reduction"); 9208 } 9209 } 9210 9211 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, 9212 SelectionDAG &DAG) const { 9213 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 9214 if (!Subtarget.hasLSE()) 9215 return SDValue(); 9216 9217 // LSE has an atomic load-add instruction, but not a load-sub. 9218 SDLoc dl(Op); 9219 MVT VT = Op.getSimpleValueType(); 9220 SDValue RHS = Op.getOperand(2); 9221 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 9222 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); 9223 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), 9224 Op.getOperand(0), Op.getOperand(1), RHS, 9225 AN->getMemOperand()); 9226 } 9227 9228 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, 9229 SelectionDAG &DAG) const { 9230 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 9231 if (!Subtarget.hasLSE()) 9232 return SDValue(); 9233 9234 // LSE has an atomic load-clear instruction, but not a load-and. 
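// Sketch of the rewrite performed below (illustrative only): since
// x & y == x CLR ~y (bit clear), an atomicrmw 'and' is emitted as an atomic
// load-clear (the LDCLR family) whose operand is the bitwise complement of
// the original RHS.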
9235 SDLoc dl(Op); 9236 MVT VT = Op.getSimpleValueType(); 9237 SDValue RHS = Op.getOperand(2); 9238 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 9239 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); 9240 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), 9241 Op.getOperand(0), Op.getOperand(1), RHS, 9242 AN->getMemOperand()); 9243 } 9244 9245 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( 9246 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { 9247 SDLoc dl(Op); 9248 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 9249 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); 9250 9251 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 9252 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); 9253 if (Subtarget->hasCustomCallingConv()) 9254 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 9255 9256 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, 9257 DAG.getConstant(4, dl, MVT::i64)); 9258 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); 9259 Chain = 9260 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), 9261 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), 9262 DAG.getRegisterMask(Mask), Chain.getValue(1)); 9263 // To match the actual intent better, we should read the output from X15 here 9264 // again (instead of potentially spilling it to the stack), but rereading Size 9265 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined 9266 // here. 9267 9268 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, 9269 DAG.getConstant(4, dl, MVT::i64)); 9270 return Chain; 9271 } 9272 9273 SDValue 9274 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 9275 SelectionDAG &DAG) const { 9276 assert(Subtarget->isTargetWindows() && 9277 "Only Windows alloca probing supported"); 9278 SDLoc dl(Op); 9279 // Get the inputs. 
9280 SDNode *Node = Op.getNode(); 9281 SDValue Chain = Op.getOperand(0); 9282 SDValue Size = Op.getOperand(1); 9283 MaybeAlign Align = 9284 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 9285 EVT VT = Node->getValueType(0); 9286 9287 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 9288 "no-stack-arg-probe")) { 9289 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 9290 Chain = SP.getValue(1); 9291 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 9292 if (Align) 9293 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 9294 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 9295 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 9296 SDValue Ops[2] = {SP, Chain}; 9297 return DAG.getMergeValues(Ops, dl); 9298 } 9299 9300 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 9301 9302 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); 9303 9304 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 9305 Chain = SP.getValue(1); 9306 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 9307 if (Align) 9308 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 9309 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 9310 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 9311 9312 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 9313 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); 9314 9315 SDValue Ops[2] = {SP, Chain}; 9316 return DAG.getMergeValues(Ops, dl); 9317 } 9318 9319 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, 9320 SelectionDAG &DAG) const { 9321 EVT VT = Op.getValueType(); 9322 assert(VT != MVT::i64 && "Expected illegal VSCALE node"); 9323 9324 SDLoc DL(Op); 9325 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); 9326 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), 9327 DL, VT); 9328 } 9329 9330 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. 9331 template <unsigned NumVecs> 9332 static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, 9333 const CallInst &CI) { 9334 Info.opc = ISD::INTRINSIC_VOID; 9335 // Retrieve EC from first vector argument. 9336 const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType()); 9337 ElementCount EC = VT.getVectorElementCount(); 9338 #ifndef NDEBUG 9339 // Check the assumption that all input vectors are the same type. 9340 for (unsigned I = 0; I < NumVecs; ++I) 9341 assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) && 9342 "Invalid type."); 9343 #endif 9344 // memVT is `NumVecs * VT`. 9345 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), 9346 EC * NumVecs); 9347 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1); 9348 Info.offset = 0; 9349 Info.align.reset(); 9350 Info.flags = MachineMemOperand::MOStore; 9351 return true; 9352 } 9353 9354 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 9355 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 9356 /// specified in the intrinsic calls. 
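/// For example (an informal sketch of the conservative encoding used below):
/// an aarch64_neon_ld2 returning { <4 x i32>, <4 x i32> } is described as a
/// single 32-byte MOLoad (memVT v4i64) based at the intrinsic's trailing
/// pointer argument.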
9357 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 9358 const CallInst &I, 9359 MachineFunction &MF, 9360 unsigned Intrinsic) const { 9361 auto &DL = I.getModule()->getDataLayout(); 9362 switch (Intrinsic) { 9363 case Intrinsic::aarch64_sve_st2: 9364 return setInfoSVEStN<2>(Info, I); 9365 case Intrinsic::aarch64_sve_st3: 9366 return setInfoSVEStN<3>(Info, I); 9367 case Intrinsic::aarch64_sve_st4: 9368 return setInfoSVEStN<4>(Info, I); 9369 case Intrinsic::aarch64_neon_ld2: 9370 case Intrinsic::aarch64_neon_ld3: 9371 case Intrinsic::aarch64_neon_ld4: 9372 case Intrinsic::aarch64_neon_ld1x2: 9373 case Intrinsic::aarch64_neon_ld1x3: 9374 case Intrinsic::aarch64_neon_ld1x4: 9375 case Intrinsic::aarch64_neon_ld2lane: 9376 case Intrinsic::aarch64_neon_ld3lane: 9377 case Intrinsic::aarch64_neon_ld4lane: 9378 case Intrinsic::aarch64_neon_ld2r: 9379 case Intrinsic::aarch64_neon_ld3r: 9380 case Intrinsic::aarch64_neon_ld4r: { 9381 Info.opc = ISD::INTRINSIC_W_CHAIN; 9382 // Conservatively set memVT to the entire set of vectors loaded. 9383 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 9384 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 9385 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 9386 Info.offset = 0; 9387 Info.align.reset(); 9388 // volatile loads with NEON intrinsics not supported 9389 Info.flags = MachineMemOperand::MOLoad; 9390 return true; 9391 } 9392 case Intrinsic::aarch64_neon_st2: 9393 case Intrinsic::aarch64_neon_st3: 9394 case Intrinsic::aarch64_neon_st4: 9395 case Intrinsic::aarch64_neon_st1x2: 9396 case Intrinsic::aarch64_neon_st1x3: 9397 case Intrinsic::aarch64_neon_st1x4: 9398 case Intrinsic::aarch64_neon_st2lane: 9399 case Intrinsic::aarch64_neon_st3lane: 9400 case Intrinsic::aarch64_neon_st4lane: { 9401 Info.opc = ISD::INTRINSIC_VOID; 9402 // Conservatively set memVT to the entire set of vectors stored. 
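// E.g. (illustrative): an aarch64_neon_st3 of three <4 x i32> values
// contributes 3 * 128 / 64 = 6 units below, giving memVT v6i64 (48 bytes).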
9403 unsigned NumElts = 0; 9404 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 9405 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 9406 if (!ArgTy->isVectorTy()) 9407 break; 9408 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 9409 } 9410 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 9411 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 9412 Info.offset = 0; 9413 Info.align.reset(); 9414 // volatile stores with NEON intrinsics not supported 9415 Info.flags = MachineMemOperand::MOStore; 9416 return true; 9417 } 9418 case Intrinsic::aarch64_ldaxr: 9419 case Intrinsic::aarch64_ldxr: { 9420 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 9421 Info.opc = ISD::INTRINSIC_W_CHAIN; 9422 Info.memVT = MVT::getVT(PtrTy->getElementType()); 9423 Info.ptrVal = I.getArgOperand(0); 9424 Info.offset = 0; 9425 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 9426 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 9427 return true; 9428 } 9429 case Intrinsic::aarch64_stlxr: 9430 case Intrinsic::aarch64_stxr: { 9431 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 9432 Info.opc = ISD::INTRINSIC_W_CHAIN; 9433 Info.memVT = MVT::getVT(PtrTy->getElementType()); 9434 Info.ptrVal = I.getArgOperand(1); 9435 Info.offset = 0; 9436 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 9437 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 9438 return true; 9439 } 9440 case Intrinsic::aarch64_ldaxp: 9441 case Intrinsic::aarch64_ldxp: 9442 Info.opc = ISD::INTRINSIC_W_CHAIN; 9443 Info.memVT = MVT::i128; 9444 Info.ptrVal = I.getArgOperand(0); 9445 Info.offset = 0; 9446 Info.align = Align(16); 9447 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 9448 return true; 9449 case Intrinsic::aarch64_stlxp: 9450 case Intrinsic::aarch64_stxp: 9451 Info.opc = ISD::INTRINSIC_W_CHAIN; 9452 Info.memVT = MVT::i128; 9453 Info.ptrVal = I.getArgOperand(2); 9454 Info.offset = 0; 9455 Info.align = Align(16); 9456 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 9457 return true; 9458 case Intrinsic::aarch64_sve_ldnt1: { 9459 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 9460 Info.opc = ISD::INTRINSIC_W_CHAIN; 9461 Info.memVT = MVT::getVT(I.getType()); 9462 Info.ptrVal = I.getArgOperand(1); 9463 Info.offset = 0; 9464 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 9465 Info.flags = MachineMemOperand::MOLoad; 9466 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) 9467 Info.flags |= MachineMemOperand::MONonTemporal; 9468 return true; 9469 } 9470 case Intrinsic::aarch64_sve_stnt1: { 9471 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); 9472 Info.opc = ISD::INTRINSIC_W_CHAIN; 9473 Info.memVT = MVT::getVT(I.getOperand(0)->getType()); 9474 Info.ptrVal = I.getArgOperand(2); 9475 Info.offset = 0; 9476 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 9477 Info.flags = MachineMemOperand::MOStore; 9478 if (Intrinsic == Intrinsic::aarch64_sve_stnt1) 9479 Info.flags |= MachineMemOperand::MONonTemporal; 9480 return true; 9481 } 9482 default: 9483 break; 9484 } 9485 9486 return false; 9487 } 9488 9489 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, 9490 ISD::LoadExtType ExtTy, 9491 EVT NewVT) const { 9492 // TODO: This may be worth removing. Check regression tests for diffs. 
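// Illustrative example (not from a specific test): for a load whose address
// is (add x1, (shl x2, 3)), an 8-byte access lets the shift fold into the
// addressing mode, e.g. ldr x0, [x1, x2, lsl #3]; narrowing the load to
// 4 bytes would break that fold, which is what the check below guards
// against.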
9493 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) 9494 return false; 9495 9496 // If we're reducing the load width in order to avoid having to use an extra 9497 // instruction to do extension then it's probably a good idea. 9498 if (ExtTy != ISD::NON_EXTLOAD) 9499 return true; 9500 // Don't reduce load width if it would prevent us from combining a shift into 9501 // the offset. 9502 MemSDNode *Mem = dyn_cast<MemSDNode>(Load); 9503 assert(Mem); 9504 const SDValue &Base = Mem->getBasePtr(); 9505 if (Base.getOpcode() == ISD::ADD && 9506 Base.getOperand(1).getOpcode() == ISD::SHL && 9507 Base.getOperand(1).hasOneUse() && 9508 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { 9509 // The shift can be combined if it matches the size of the value being 9510 // loaded (and so reducing the width would make it not match). 9511 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); 9512 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; 9513 if (ShiftAmount == Log2_32(LoadBytes)) 9514 return false; 9515 } 9516 // We have no reason to disallow reducing the load width, so allow it. 9517 return true; 9518 } 9519 9520 // Truncations from 64-bit GPR to 32-bit GPR is free. 9521 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 9522 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 9523 return false; 9524 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 9525 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 9526 return NumBits1 > NumBits2; 9527 } 9528 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 9529 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 9530 return false; 9531 unsigned NumBits1 = VT1.getSizeInBits(); 9532 unsigned NumBits2 = VT2.getSizeInBits(); 9533 return NumBits1 > NumBits2; 9534 } 9535 9536 /// Check if it is profitable to hoist instruction in then/else to if. 9537 /// Not profitable if I and it's user can form a FMA instruction 9538 /// because we prefer FMSUB/FMADD. 9539 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { 9540 if (I->getOpcode() != Instruction::FMul) 9541 return true; 9542 9543 if (!I->hasOneUse()) 9544 return true; 9545 9546 Instruction *User = I->user_back(); 9547 9548 if (User && 9549 !(User->getOpcode() == Instruction::FSub || 9550 User->getOpcode() == Instruction::FAdd)) 9551 return true; 9552 9553 const TargetOptions &Options = getTargetMachine().Options; 9554 const Function *F = I->getFunction(); 9555 const DataLayout &DL = F->getParent()->getDataLayout(); 9556 Type *Ty = User->getOperand(0)->getType(); 9557 9558 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && 9559 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && 9560 (Options.AllowFPOpFusion == FPOpFusion::Fast || 9561 Options.UnsafeFPMath)); 9562 } 9563 9564 // All 32-bit GPR operations implicitly zero the high-half of the corresponding 9565 // 64-bit GPR. 
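// For example (illustrative): "add w0, w1, w2" leaves bits [63:32] of x0
// zero, so a subsequent i32 -> i64 zero-extension of that result costs no
// extra instruction; the checks below therefore treat 32 -> 64 bit zext as
// free.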
9566 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 9567 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 9568 return false; 9569 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 9570 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 9571 return NumBits1 == 32 && NumBits2 == 64; 9572 } 9573 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 9574 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 9575 return false; 9576 unsigned NumBits1 = VT1.getSizeInBits(); 9577 unsigned NumBits2 = VT2.getSizeInBits(); 9578 return NumBits1 == 32 && NumBits2 == 64; 9579 } 9580 9581 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 9582 EVT VT1 = Val.getValueType(); 9583 if (isZExtFree(VT1, VT2)) { 9584 return true; 9585 } 9586 9587 if (Val.getOpcode() != ISD::LOAD) 9588 return false; 9589 9590 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. 9591 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && 9592 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && 9593 VT1.getSizeInBits() <= 32); 9594 } 9595 9596 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { 9597 if (isa<FPExtInst>(Ext)) 9598 return false; 9599 9600 // Vector types are not free. 9601 if (Ext->getType()->isVectorTy()) 9602 return false; 9603 9604 for (const Use &U : Ext->uses()) { 9605 // The extension is free if we can fold it with a left shift in an 9606 // addressing mode or an arithmetic operation: add, sub, and cmp. 9607 9608 // Is there a shift? 9609 const Instruction *Instr = cast<Instruction>(U.getUser()); 9610 9611 // Is this a constant shift? 9612 switch (Instr->getOpcode()) { 9613 case Instruction::Shl: 9614 if (!isa<ConstantInt>(Instr->getOperand(1))) 9615 return false; 9616 break; 9617 case Instruction::GetElementPtr: { 9618 gep_type_iterator GTI = gep_type_begin(Instr); 9619 auto &DL = Ext->getModule()->getDataLayout(); 9620 std::advance(GTI, U.getOperandNo()-1); 9621 Type *IdxTy = GTI.getIndexedType(); 9622 // This extension will end up with a shift because of the scaling factor. 9623 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. 9624 // Get the shift amount based on the scaling factor: 9625 // log2(sizeof(IdxTy)) - log2(8). 9626 uint64_t ShiftAmt = 9627 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; 9628 // Is the constant foldable in the shift of the addressing mode? 9629 // I.e., shift amount is between 1 and 4 inclusive. 9630 if (ShiftAmt == 0 || ShiftAmt > 4) 9631 return false; 9632 break; 9633 } 9634 case Instruction::Trunc: 9635 // Check if this is a noop. 9636 // trunc(sext ty1 to ty2) to ty1. 9637 if (Instr->getType() == Ext->getOperand(0)->getType()) 9638 continue; 9639 LLVM_FALLTHROUGH; 9640 default: 9641 return false; 9642 } 9643 9644 // At this point we can use the bfm family, so this extension is free 9645 // for that use. 9646 } 9647 return true; 9648 } 9649 9650 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower 9651 /// or upper half of the vector elements. 
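/// For example (an informal sketch with <8 x i16> shuffle inputs): the masks
/// <0, 1, 2, 3> and <4, 5, 6, 7> select the lower and upper halves
/// respectively; the predicate below accepts Op1/Op2 only when both extract
/// the same half.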
9652 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { 9653 auto areTypesHalfed = [](Value *FullV, Value *HalfV) { 9654 auto *FullTy = FullV->getType(); 9655 auto *HalfTy = HalfV->getType(); 9656 return FullTy->getPrimitiveSizeInBits().getFixedSize() == 9657 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); 9658 }; 9659 9660 auto extractHalf = [](Value *FullV, Value *HalfV) { 9661 auto *FullVT = cast<FixedVectorType>(FullV->getType()); 9662 auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); 9663 return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); 9664 }; 9665 9666 ArrayRef<int> M1, M2; 9667 Value *S1Op1, *S2Op1; 9668 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || 9669 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) 9670 return false; 9671 9672 // Check that the operands are half as wide as the result and we extract 9673 // half of the elements of the input vectors. 9674 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || 9675 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) 9676 return false; 9677 9678 // Check the mask extracts either the lower or upper half of vector 9679 // elements. 9680 int M1Start = -1; 9681 int M2Start = -1; 9682 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; 9683 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || 9684 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || 9685 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) 9686 return false; 9687 9688 return true; 9689 } 9690 9691 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 9692 /// of the vector elements. 9693 static bool areExtractExts(Value *Ext1, Value *Ext2) { 9694 auto areExtDoubled = [](Instruction *Ext) { 9695 return Ext->getType()->getScalarSizeInBits() == 9696 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 9697 }; 9698 9699 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 9700 !match(Ext2, m_ZExtOrSExt(m_Value())) || 9701 !areExtDoubled(cast<Instruction>(Ext1)) || 9702 !areExtDoubled(cast<Instruction>(Ext2))) 9703 return false; 9704 9705 return true; 9706 } 9707 9708 /// Check if Op could be used with vmull_high_p64 intrinsic. 9709 static bool isOperandOfVmullHighP64(Value *Op) { 9710 Value *VectorOperand = nullptr; 9711 ConstantInt *ElementIndex = nullptr; 9712 return match(Op, m_ExtractElt(m_Value(VectorOperand), 9713 m_ConstantInt(ElementIndex))) && 9714 ElementIndex->getValue() == 1 && 9715 isa<FixedVectorType>(VectorOperand->getType()) && 9716 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; 9717 } 9718 9719 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. 9720 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { 9721 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); 9722 } 9723 9724 /// Check if sinking \p I's operands to I's basic block is profitable, because 9725 /// the operands can be folded into a target instruction, e.g. 9726 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
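/// For example (illustrative): in
///   %s = sub (zext <8 x i8> %hiA to <8 x i16>), (zext <8 x i8> %hiB to <8 x i16>)
/// where %hiA/%hiB are upper-half shufflevector extracts defined in another
/// block, sinking the extracts and extends next to the sub lets instruction
/// selection form a single usubl2.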
9727 bool AArch64TargetLowering::shouldSinkOperands( 9728 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 9729 if (!I->getType()->isVectorTy()) 9730 return false; 9731 9732 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 9733 switch (II->getIntrinsicID()) { 9734 case Intrinsic::aarch64_neon_umull: 9735 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) 9736 return false; 9737 Ops.push_back(&II->getOperandUse(0)); 9738 Ops.push_back(&II->getOperandUse(1)); 9739 return true; 9740 9741 case Intrinsic::aarch64_neon_pmull64: 9742 if (!areOperandsOfVmullHighP64(II->getArgOperand(0), 9743 II->getArgOperand(1))) 9744 return false; 9745 Ops.push_back(&II->getArgOperandUse(0)); 9746 Ops.push_back(&II->getArgOperandUse(1)); 9747 return true; 9748 9749 default: 9750 return false; 9751 } 9752 } 9753 9754 switch (I->getOpcode()) { 9755 case Instruction::Sub: 9756 case Instruction::Add: { 9757 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 9758 return false; 9759 9760 // If the exts' operands extract either the lower or upper elements, we 9761 // can sink them too. 9762 auto Ext1 = cast<Instruction>(I->getOperand(0)); 9763 auto Ext2 = cast<Instruction>(I->getOperand(1)); 9764 if (areExtractShuffleVectors(Ext1, Ext2)) { 9765 Ops.push_back(&Ext1->getOperandUse(0)); 9766 Ops.push_back(&Ext2->getOperandUse(0)); 9767 } 9768 9769 Ops.push_back(&I->getOperandUse(0)); 9770 Ops.push_back(&I->getOperandUse(1)); 9771 9772 return true; 9773 } 9774 default: 9775 return false; 9776 } 9777 return false; 9778 } 9779 9780 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 9781 Align &RequiredAligment) const { 9782 if (!LoadedType.isSimple() || 9783 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 9784 return false; 9785 // Cyclone supports unaligned accesses. 9786 RequiredAligment = Align(1); 9787 unsigned NumBits = LoadedType.getSizeInBits(); 9788 return NumBits == 32 || NumBits == 64; 9789 } 9790 9791 /// A helper function for determining the number of interleaved accesses we 9792 /// will generate when lowering accesses of the given type. 9793 unsigned 9794 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 9795 const DataLayout &DL) const { 9796 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 9797 } 9798 9799 MachineMemOperand::Flags 9800 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { 9801 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && 9802 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) 9803 return MOStridedAccess; 9804 return MachineMemOperand::MONone; 9805 } 9806 9807 bool AArch64TargetLowering::isLegalInterleavedAccessType( 9808 VectorType *VecTy, const DataLayout &DL) const { 9809 9810 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 9811 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 9812 9813 // Ensure the number of vector elements is greater than 1. 9814 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2) 9815 return false; 9816 9817 // Ensure the element type is legal. 9818 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) 9819 return false; 9820 9821 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 9822 // 128 will be split into multiple interleaved accesses. 9823 return VecSize == 64 || VecSize % 128 == 0; 9824 } 9825 9826 /// Lower an interleaved load into a ldN intrinsic. 9827 /// 9828 /// E.g. 
Lower an interleaved load (Factor = 2): 9829 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 9830 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 9831 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 9832 /// 9833 /// Into: 9834 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 9835 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 9836 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 9837 bool AArch64TargetLowering::lowerInterleavedLoad( 9838 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 9839 ArrayRef<unsigned> Indices, unsigned Factor) const { 9840 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 9841 "Invalid interleave factor"); 9842 assert(!Shuffles.empty() && "Empty shufflevector input"); 9843 assert(Shuffles.size() == Indices.size() && 9844 "Unmatched number of shufflevectors and indices"); 9845 9846 const DataLayout &DL = LI->getModule()->getDataLayout(); 9847 9848 VectorType *VTy = Shuffles[0]->getType(); 9849 9850 // Skip if we do not have NEON and skip illegal vector types. We can 9851 // "legalize" wide vector types into multiple interleaved accesses as long as 9852 // the vector types are divisible by 128. 9853 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL)) 9854 return false; 9855 9856 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL); 9857 9858 auto *FVTy = cast<FixedVectorType>(VTy); 9859 9860 // A pointer vector can not be the return type of the ldN intrinsics. Need to 9861 // load integer vectors first and then convert to pointer vectors. 9862 Type *EltTy = FVTy->getElementType(); 9863 if (EltTy->isPointerTy()) 9864 FVTy = 9865 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); 9866 9867 IRBuilder<> Builder(LI); 9868 9869 // The base address of the load. 9870 Value *BaseAddr = LI->getPointerOperand(); 9871 9872 if (NumLoads > 1) { 9873 // If we're going to generate more than one load, reset the sub-vector type 9874 // to something legal. 9875 FVTy = FixedVectorType::get(FVTy->getElementType(), 9876 FVTy->getNumElements() / NumLoads); 9877 9878 // We will compute the pointer operand of each load from the original base 9879 // address using GEPs. Cast the base address to a pointer to the scalar 9880 // element type. 9881 BaseAddr = Builder.CreateBitCast( 9882 BaseAddr, 9883 FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 9884 } 9885 9886 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace()); 9887 Type *Tys[2] = {FVTy, PtrTy}; 9888 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, 9889 Intrinsic::aarch64_neon_ld3, 9890 Intrinsic::aarch64_neon_ld4}; 9891 Function *LdNFunc = 9892 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 9893 9894 // Holds sub-vectors extracted from the load intrinsic return values. The 9895 // sub-vectors are associated with the shufflevector instructions they will 9896 // replace. 9897 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 9898 9899 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 9900 9901 // If we're generating more than one load, compute the base address of 9902 // subsequent loads as an offset from the previous. 
9903 if (LoadCount > 0) 9904 BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr, 9905 FVTy->getNumElements() * Factor); 9906 9907 CallInst *LdN = Builder.CreateCall( 9908 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); 9909 9910 // Extract and store the sub-vectors returned by the load intrinsic. 9911 for (unsigned i = 0; i < Shuffles.size(); i++) { 9912 ShuffleVectorInst *SVI = Shuffles[i]; 9913 unsigned Index = Indices[i]; 9914 9915 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 9916 9917 // Convert the integer vector to pointer vector if the element is pointer. 9918 if (EltTy->isPointerTy()) 9919 SubVec = Builder.CreateIntToPtr( 9920 SubVec, FixedVectorType::get(SVI->getType()->getElementType(), 9921 FVTy->getNumElements())); 9922 SubVecs[SVI].push_back(SubVec); 9923 } 9924 } 9925 9926 // Replace uses of the shufflevector instructions with the sub-vectors 9927 // returned by the load intrinsic. If a shufflevector instruction is 9928 // associated with more than one sub-vector, those sub-vectors will be 9929 // concatenated into a single wide vector. 9930 for (ShuffleVectorInst *SVI : Shuffles) { 9931 auto &SubVec = SubVecs[SVI]; 9932 auto *WideVec = 9933 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 9934 SVI->replaceAllUsesWith(WideVec); 9935 } 9936 9937 return true; 9938 } 9939 9940 /// Lower an interleaved store into a stN intrinsic. 9941 /// 9942 /// E.g. Lower an interleaved store (Factor = 3): 9943 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 9944 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 9945 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 9946 /// 9947 /// Into: 9948 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 9949 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 9950 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 9951 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 9952 /// 9953 /// Note that the new shufflevectors will be removed and we'll only generate one 9954 /// st3 instruction in CodeGen. 9955 /// 9956 /// Example for a more general valid mask (Factor 3). Lower: 9957 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 9958 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 9959 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 9960 /// 9961 /// Into: 9962 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 9963 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 9964 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 9965 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 9966 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 9967 ShuffleVectorInst *SVI, 9968 unsigned Factor) const { 9969 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 9970 "Invalid interleave factor"); 9971 9972 auto *VecTy = cast<FixedVectorType>(SVI->getType()); 9973 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); 9974 9975 unsigned LaneLen = VecTy->getNumElements() / Factor; 9976 Type *EltTy = VecTy->getElementType(); 9977 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); 9978 9979 const DataLayout &DL = SI->getModule()->getDataLayout(); 9980 9981 // Skip if we do not have NEON and skip illegal vector types. We can 9982 // "legalize" wide vector types into multiple interleaved accesses as long as 9983 // the vector types are divisible by 128. 
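// E.g. (illustrative): a Factor-2 store of a <16 x i32> shuffle has
// SubVecTy = <8 x i32> (256 bits), so NumStores below becomes 2 and the
// access is emitted as two st2 calls on <4 x i32> sub-vectors.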
9984 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) 9985 return false; 9986 9987 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 9988 9989 Value *Op0 = SVI->getOperand(0); 9990 Value *Op1 = SVI->getOperand(1); 9991 IRBuilder<> Builder(SI); 9992 9993 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 9994 // vectors to integer vectors. 9995 if (EltTy->isPointerTy()) { 9996 Type *IntTy = DL.getIntPtrType(EltTy); 9997 unsigned NumOpElts = 9998 cast<FixedVectorType>(Op0->getType())->getNumElements(); 9999 10000 // Convert to the corresponding integer vector. 10001 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); 10002 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 10003 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 10004 10005 SubVecTy = FixedVectorType::get(IntTy, LaneLen); 10006 } 10007 10008 // The base address of the store. 10009 Value *BaseAddr = SI->getPointerOperand(); 10010 10011 if (NumStores > 1) { 10012 // If we're going to generate more than one store, reset the lane length 10013 // and sub-vector type to something legal. 10014 LaneLen /= NumStores; 10015 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); 10016 10017 // We will compute the pointer operand of each store from the original base 10018 // address using GEPs. Cast the base address to a pointer to the scalar 10019 // element type. 10020 BaseAddr = Builder.CreateBitCast( 10021 BaseAddr, 10022 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); 10023 } 10024 10025 auto Mask = SVI->getShuffleMask(); 10026 10027 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); 10028 Type *Tys[2] = {SubVecTy, PtrTy}; 10029 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, 10030 Intrinsic::aarch64_neon_st3, 10031 Intrinsic::aarch64_neon_st4}; 10032 Function *StNFunc = 10033 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 10034 10035 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 10036 10037 SmallVector<Value *, 5> Ops; 10038 10039 // Split the shufflevector operands into sub vectors for the new stN call. 10040 for (unsigned i = 0; i < Factor; i++) { 10041 unsigned IdxI = StoreCount * LaneLen * Factor + i; 10042 if (Mask[IdxI] >= 0) { 10043 Ops.push_back(Builder.CreateShuffleVector( 10044 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); 10045 } else { 10046 unsigned StartMask = 0; 10047 for (unsigned j = 1; j < LaneLen; j++) { 10048 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 10049 if (Mask[IdxJ * Factor + IdxI] >= 0) { 10050 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 10051 break; 10052 } 10053 } 10054 // Note: Filling undef gaps with random elements is ok, since 10055 // those elements were being written anyway (with undefs). 10056 // In the case of all undefs we're defaulting to using elems from 0 10057 // Note: StartMask cannot be negative, it's checked in 10058 // isReInterleaveMask 10059 Ops.push_back(Builder.CreateShuffleVector( 10060 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); 10061 } 10062 } 10063 10064 // If we generating more than one store, we compute the base address of 10065 // subsequent stores as an offset from the previous. 
10066 if (StoreCount > 0) 10067 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), 10068 BaseAddr, LaneLen * Factor); 10069 10070 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); 10071 Builder.CreateCall(StNFunc, Ops); 10072 } 10073 return true; 10074 } 10075 10076 // Lower an SVE structured load intrinsic returning a tuple type to target 10077 // specific intrinsic taking the same input but returning a multi-result value 10078 // of the split tuple type. 10079 // 10080 // E.g. Lowering an LD3: 10081 // 10082 // call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32( 10083 // <vscale x 4 x i1> %pred, 10084 // <vscale x 4 x i32>* %addr) 10085 // 10086 // Output DAG: 10087 // 10088 // t0: ch = EntryToken 10089 // t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0 10090 // t4: i64,ch = CopyFromReg t0, Register:i64 %1 10091 // t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4 10092 // t6: nxv12i32 = concat_vectors t5, t5:1, t5:2 10093 // 10094 // This is called pre-legalization to avoid widening/splitting issues with 10095 // non-power-of-2 tuple types used for LD3, such as nxv12i32. 10096 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, 10097 ArrayRef<SDValue> LoadOps, 10098 EVT VT, SelectionDAG &DAG, 10099 const SDLoc &DL) const { 10100 assert(VT.isScalableVector() && "Can only lower scalable vectors"); 10101 10102 unsigned N, Opcode; 10103 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = { 10104 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, 10105 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, 10106 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; 10107 10108 std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; 10109 assert(VT.getVectorElementCount().Min % N == 0 && 10110 "invalid tuple vector type!"); 10111 10112 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 10113 VT.getVectorElementCount() / N); 10114 assert(isTypeLegal(SplitVT)); 10115 10116 SmallVector<EVT, 5> VTs(N, SplitVT); 10117 VTs.push_back(MVT::Other); // Chain 10118 SDVTList NodeTys = DAG.getVTList(VTs); 10119 10120 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps); 10121 SmallVector<SDValue, 4> PseudoLoadOps; 10122 for (unsigned I = 0; I < N; ++I) 10123 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I)); 10124 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps); 10125 } 10126 10127 EVT AArch64TargetLowering::getOptimalMemOpType( 10128 const MemOp &Op, const AttributeList &FuncAttributes) const { 10129 bool CanImplicitFloat = 10130 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); 10131 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 10132 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 10133 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 10134 // taken one instruction to materialize the v2i64 zero and one store (with 10135 // restrictive addressing mode). Just do i64 stores. 
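// E.g. (illustrative): a 16-byte zero memset is already a single
// "stp xzr, xzr" store, so spending an extra instruction to materialize a
// v2i64 zero only pays off once at least 32 bytes are written.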
10136 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 10137 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 10138 if (Op.isAligned(AlignCheck)) 10139 return true; 10140 bool Fast; 10141 return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, 10142 &Fast) && 10143 Fast; 10144 }; 10145 10146 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 10147 AlignmentIsAcceptable(MVT::v2i64, Align(16))) 10148 return MVT::v2i64; 10149 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 10150 return MVT::f128; 10151 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 10152 return MVT::i64; 10153 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 10154 return MVT::i32; 10155 return MVT::Other; 10156 } 10157 10158 LLT AArch64TargetLowering::getOptimalMemOpLLT( 10159 const MemOp &Op, const AttributeList &FuncAttributes) const { 10160 bool CanImplicitFloat = 10161 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); 10162 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; 10163 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; 10164 // Only use AdvSIMD to implement memset of 32-byte and above. It would have 10165 // taken one instruction to materialize the v2i64 zero and one store (with 10166 // restrictive addressing mode). Just do i64 stores. 10167 bool IsSmallMemset = Op.isMemset() && Op.size() < 32; 10168 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { 10169 if (Op.isAligned(AlignCheck)) 10170 return true; 10171 bool Fast; 10172 return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, 10173 &Fast) && 10174 Fast; 10175 }; 10176 10177 if (CanUseNEON && Op.isMemset() && !IsSmallMemset && 10178 AlignmentIsAcceptable(MVT::v2i64, Align(16))) 10179 return LLT::vector(2, 64); 10180 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) 10181 return LLT::scalar(128); 10182 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) 10183 return LLT::scalar(64); 10184 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) 10185 return LLT::scalar(32); 10186 return LLT(); 10187 } 10188 10189 // 12-bit optionally shifted immediates are legal for adds. 10190 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { 10191 if (Immed == std::numeric_limits<int64_t>::min()) { 10192 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed 10193 << ": avoid UB for INT64_MIN\n"); 10194 return false; 10195 } 10196 // Same encoding for add/sub, just flip the sign. 10197 Immed = std::abs(Immed); 10198 bool IsLegal = ((Immed >> 12) == 0 || 10199 ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); 10200 LLVM_DEBUG(dbgs() << "Is " << Immed 10201 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); 10202 return IsLegal; 10203 } 10204 10205 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 10206 // immediates is the same as for an add or a sub. 10207 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 10208 return isLegalAddImmediate(Immed); 10209 } 10210 10211 /// isLegalAddressingMode - Return true if the addressing mode represented 10212 /// by AM is legal for this target, for a load/store of the specified type. 
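/// For example (an informal sketch for an i64 load/store): [x0],
/// [x0, #-256 .. #255] (9-bit signed offset), [x0, #8 * uimm12],
/// [x0, x1] and [x0, x1, lsl #3] are all accepted below, whereas a combined
/// base + register + immediate form is not.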
10213 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 10214 const AddrMode &AM, Type *Ty, 10215 unsigned AS, Instruction *I) const { 10216 // AArch64 has five basic addressing modes: 10217 // reg 10218 // reg + 9-bit signed offset 10219 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 10220 // reg1 + reg2 10221 // reg + SIZE_IN_BYTES * reg 10222 10223 // No global is ever allowed as a base. 10224 if (AM.BaseGV) 10225 return false; 10226 10227 // No reg+reg+imm addressing. 10228 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 10229 return false; 10230 10231 // FIXME: Update this method to support scalable addressing modes. 10232 if (isa<ScalableVectorType>(Ty)) 10233 return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; 10234 10235 // check reg + imm case: 10236 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 10237 uint64_t NumBytes = 0; 10238 if (Ty->isSized()) { 10239 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 10240 NumBytes = NumBits / 8; 10241 if (!isPowerOf2_64(NumBits)) 10242 NumBytes = 0; 10243 } 10244 10245 if (!AM.Scale) { 10246 int64_t Offset = AM.BaseOffs; 10247 10248 // 9-bit signed offset 10249 if (isInt<9>(Offset)) 10250 return true; 10251 10252 // 12-bit unsigned offset 10253 unsigned shift = Log2_64(NumBytes); 10254 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 10255 // Must be a multiple of NumBytes (NumBytes is a power of 2) 10256 (Offset >> shift) << shift == Offset) 10257 return true; 10258 return false; 10259 } 10260 10261 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 10262 10263 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); 10264 } 10265 10266 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { 10267 // Consider splitting large offset of struct or array. 10268 return true; 10269 } 10270 10271 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, 10272 const AddrMode &AM, Type *Ty, 10273 unsigned AS) const { 10274 // Scaling factors are not free at all. 10275 // Operands | Rt Latency 10276 // ------------------------------------------- 10277 // Rt, [Xn, Xm] | 4 10278 // ------------------------------------------- 10279 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 10280 // Rt, [Xn, Wm, <extend> #imm] | 10281 if (isLegalAddressingMode(DL, AM, Ty, AS)) 10282 // Scale represents reg2 * scale, thus account for 1 if 10283 // it is not equal to 0 or 1. 10284 return AM.Scale != 0 && AM.Scale != 1; 10285 return -1; 10286 } 10287 10288 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( 10289 const MachineFunction &MF, EVT VT) const { 10290 VT = VT.getScalarType(); 10291 10292 if (!VT.isSimple()) 10293 return false; 10294 10295 switch (VT.getSimpleVT().SimpleTy) { 10296 case MVT::f32: 10297 case MVT::f64: 10298 return true; 10299 default: 10300 break; 10301 } 10302 10303 return false; 10304 } 10305 10306 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, 10307 Type *Ty) const { 10308 switch (Ty->getScalarType()->getTypeID()) { 10309 case Type::FloatTyID: 10310 case Type::DoubleTyID: 10311 return true; 10312 default: 10313 return false; 10314 } 10315 } 10316 10317 const MCPhysReg * 10318 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 10319 // LR is a callee-save register, but we must treat it as clobbered by any call 10320 // site. Hence we include LR in the scratch registers, which are in turn added 10321 // as implicit-defs for stackmaps and patchpoints. 
10322 static const MCPhysReg ScratchRegs[] = { 10323 AArch64::X16, AArch64::X17, AArch64::LR, 0 10324 }; 10325 return ScratchRegs; 10326 } 10327 10328 bool 10329 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 10330 CombineLevel Level) const { 10331 N = N->getOperand(0).getNode(); 10332 EVT VT = N->getValueType(0); 10333 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine 10334 // it with shift to let it be lowered to UBFX. 10335 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 10336 isa<ConstantSDNode>(N->getOperand(1))) { 10337 uint64_t TruncMask = N->getConstantOperandVal(1); 10338 if (isMask_64(TruncMask) && 10339 N->getOperand(0).getOpcode() == ISD::SRL && 10340 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) 10341 return false; 10342 } 10343 return true; 10344 } 10345 10346 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 10347 Type *Ty) const { 10348 assert(Ty->isIntegerTy()); 10349 10350 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 10351 if (BitSize == 0) 10352 return false; 10353 10354 int64_t Val = Imm.getSExtValue(); 10355 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 10356 return true; 10357 10358 if ((int64_t)Val < 0) 10359 Val = ~Val; 10360 if (BitSize == 32) 10361 Val &= (1LL << 32) - 1; 10362 10363 unsigned LZ = countLeadingZeros((uint64_t)Val); 10364 unsigned Shift = (63 - LZ) / 16; 10365 // MOVZ is free so return true for one or fewer MOVK. 10366 return Shift < 3; 10367 } 10368 10369 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 10370 unsigned Index) const { 10371 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 10372 return false; 10373 10374 return (Index == 0 || Index == ResVT.getVectorNumElements()); 10375 } 10376 10377 /// Turn vector tests of the signbit in the form of: 10378 /// xor (sra X, elt_size(X)-1), -1 10379 /// into: 10380 /// cmge X, X, #0 10381 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 10382 const AArch64Subtarget *Subtarget) { 10383 EVT VT = N->getValueType(0); 10384 if (!Subtarget->hasNEON() || !VT.isVector()) 10385 return SDValue(); 10386 10387 // There must be a shift right algebraic before the xor, and the xor must be a 10388 // 'not' operation. 10389 SDValue Shift = N->getOperand(0); 10390 SDValue Ones = N->getOperand(1); 10391 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || 10392 !ISD::isBuildVectorAllOnes(Ones.getNode())) 10393 return SDValue(); 10394 10395 // The shift should be smearing the sign bit across each vector element. 10396 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 10397 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 10398 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 10399 return SDValue(); 10400 10401 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); 10402 } 10403 10404 // Generate SUBS and CSEL for integer abs. 10405 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 10406 EVT VT = N->getValueType(0); 10407 10408 SDValue N0 = N->getOperand(0); 10409 SDValue N1 = N->getOperand(1); 10410 SDLoc DL(N); 10411 10412 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 10413 // and change it to SUB and CSEL. 
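// This matches the classic branchless abs idiom; e.g. for i32 (illustrative):
//   %y = ashr i32 %x, 31
//   %a = add i32 %x, %y
//   %r = xor i32 %a, %y        ; == abs(%x)
// which is rewritten below into SUBS (comparing %x with 0) plus a CSEL
// selecting between %x and 0 - %x.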
10414 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 10415 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && 10416 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) 10417 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 10418 if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { 10419 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 10420 N0.getOperand(0)); 10421 // Generate SUBS & CSEL. 10422 SDValue Cmp = 10423 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 10424 N0.getOperand(0), DAG.getConstant(0, DL, VT)); 10425 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, 10426 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 10427 SDValue(Cmp.getNode(), 1)); 10428 } 10429 return SDValue(); 10430 } 10431 10432 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, 10433 TargetLowering::DAGCombinerInfo &DCI, 10434 const AArch64Subtarget *Subtarget) { 10435 if (DCI.isBeforeLegalizeOps()) 10436 return SDValue(); 10437 10438 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) 10439 return Cmp; 10440 10441 return performIntegerAbsCombine(N, DAG); 10442 } 10443 10444 SDValue 10445 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 10446 SelectionDAG &DAG, 10447 SmallVectorImpl<SDNode *> &Created) const { 10448 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 10449 if (isIntDivCheap(N->getValueType(0), Attr)) 10450 return SDValue(N,0); // Lower SDIV as SDIV 10451 10452 // fold (sdiv X, pow2) 10453 EVT VT = N->getValueType(0); 10454 if ((VT != MVT::i32 && VT != MVT::i64) || 10455 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 10456 return SDValue(); 10457 10458 SDLoc DL(N); 10459 SDValue N0 = N->getOperand(0); 10460 unsigned Lg2 = Divisor.countTrailingZeros(); 10461 SDValue Zero = DAG.getConstant(0, DL, VT); 10462 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 10463 10464 // Add (N0 < 0) ? Pow2 - 1 : 0; 10465 SDValue CCVal; 10466 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); 10467 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); 10468 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); 10469 10470 Created.push_back(Cmp.getNode()); 10471 Created.push_back(Add.getNode()); 10472 Created.push_back(CSel.getNode()); 10473 10474 // Divide by pow2. 10475 SDValue SRA = 10476 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); 10477 10478 // If we're dividing by a positive value, we're done. Otherwise, we must 10479 // negate the result. 10480 if (Divisor.isNonNegative()) 10481 return SRA; 10482 10483 Created.push_back(SRA.getNode()); 10484 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); 10485 } 10486 10487 static bool IsSVECntIntrinsic(SDValue S) { 10488 switch(getIntrinsicID(S.getNode())) { 10489 default: 10490 break; 10491 case Intrinsic::aarch64_sve_cntb: 10492 case Intrinsic::aarch64_sve_cnth: 10493 case Intrinsic::aarch64_sve_cntw: 10494 case Intrinsic::aarch64_sve_cntd: 10495 return true; 10496 } 10497 return false; 10498 } 10499 10500 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 10501 TargetLowering::DAGCombinerInfo &DCI, 10502 const AArch64Subtarget *Subtarget) { 10503 if (DCI.isBeforeLegalizeOps()) 10504 return SDValue(); 10505 10506 // The below optimizations require a constant RHS. 
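// A few worked examples of the decompositions handled below (illustrative
// only):
//   (mul x, 3)  -> (add (shl x, 1), x)             ; 3  = 2^1 + 1
//   (mul x, 6)  -> (shl (add (shl x, 1), x), 1)    ; 6  = (2^1 + 1) * 2^1
//   (mul x, -7) -> (sub x, (shl x, 3))             ; -7 = -(2^3 - 1)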
10507   if (!isa<ConstantSDNode>(N->getOperand(1)))
10508     return SDValue();
10509 
10510   SDValue N0 = N->getOperand(0);
10511   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
10512   const APInt &ConstValue = C->getAPIntValue();
10513 
10514   // Allow the scaling to be folded into the `cnt` instruction by preventing
10515   // the scaling from being obscured here. This makes it easier to pattern match.
10516   if (IsSVECntIntrinsic(N0) ||
10517       (N0->getOpcode() == ISD::TRUNCATE &&
10518        (IsSVECntIntrinsic(N0->getOperand(0)))))
10519     if (ConstValue.sge(1) && ConstValue.sle(16))
10520       return SDValue();
10521 
10522   // Multiplication of a power of two plus/minus one can be done more
10523   // cheaply as a shift+add/sub. For now, this is true unilaterally. If
10524   // future CPUs have a cheaper MADD instruction, this may need to be
10525   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
10526   // 64-bit is 5 cycles, so this is always a win.
10527   // More aggressively, some multiplications N0 * C can be lowered to
10528   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
10529   // e.g. 6=3*2=(2+1)*2.
10530   // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
10531   // which equals (1+2)*16-(1+2).
10532   // TrailingZeroes is used to test if the mul can be lowered to
10533   // shift+add+shift.
10534   unsigned TrailingZeroes = ConstValue.countTrailingZeros();
10535   if (TrailingZeroes) {
10536     // Conservatively do not lower to shift+add+shift if the mul might be
10537     // folded into smul or umul.
10538     if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
10539                             isZeroExtended(N0.getNode(), DAG)))
10540       return SDValue();
10541     // Conservatively do not lower to shift+add+shift if the mul might be
10542     // folded into madd or msub.
10543     if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
10544                            N->use_begin()->getOpcode() == ISD::SUB))
10545       return SDValue();
10546   }
10547   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
10548   // and shift+add+shift.
10549   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
10550 
10551   unsigned ShiftAmt, AddSubOpc;
10552   // Is the shifted value the LHS operand of the add/sub?
10553   bool ShiftValUseIsN0 = true;
10554   // Do we need to negate the result?
10555 bool NegateResult = false; 10556 10557 if (ConstValue.isNonNegative()) { 10558 // (mul x, 2^N + 1) => (add (shl x, N), x) 10559 // (mul x, 2^N - 1) => (sub (shl x, N), x) 10560 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) 10561 APInt SCVMinus1 = ShiftedConstValue - 1; 10562 APInt CVPlus1 = ConstValue + 1; 10563 if (SCVMinus1.isPowerOf2()) { 10564 ShiftAmt = SCVMinus1.logBase2(); 10565 AddSubOpc = ISD::ADD; 10566 } else if (CVPlus1.isPowerOf2()) { 10567 ShiftAmt = CVPlus1.logBase2(); 10568 AddSubOpc = ISD::SUB; 10569 } else 10570 return SDValue(); 10571 } else { 10572 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 10573 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 10574 APInt CVNegPlus1 = -ConstValue + 1; 10575 APInt CVNegMinus1 = -ConstValue - 1; 10576 if (CVNegPlus1.isPowerOf2()) { 10577 ShiftAmt = CVNegPlus1.logBase2(); 10578 AddSubOpc = ISD::SUB; 10579 ShiftValUseIsN0 = false; 10580 } else if (CVNegMinus1.isPowerOf2()) { 10581 ShiftAmt = CVNegMinus1.logBase2(); 10582 AddSubOpc = ISD::ADD; 10583 NegateResult = true; 10584 } else 10585 return SDValue(); 10586 } 10587 10588 SDLoc DL(N); 10589 EVT VT = N->getValueType(0); 10590 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0, 10591 DAG.getConstant(ShiftAmt, DL, MVT::i64)); 10592 10593 SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0; 10594 SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal; 10595 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1); 10596 assert(!(NegateResult && TrailingZeroes) && 10597 "NegateResult and TrailingZeroes cannot both be true for now."); 10598 // Negate the result. 10599 if (NegateResult) 10600 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); 10601 // Shift the result. 10602 if (TrailingZeroes) 10603 return DAG.getNode(ISD::SHL, DL, VT, Res, 10604 DAG.getConstant(TrailingZeroes, DL, MVT::i64)); 10605 return Res; 10606 } 10607 10608 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 10609 SelectionDAG &DAG) { 10610 // Take advantage of vector comparisons producing 0 or -1 in each lane to 10611 // optimize away operation when it's from a constant. 10612 // 10613 // The general transformation is: 10614 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 10615 // AND(VECTOR_CMP(x,y), constant2) 10616 // constant2 = UNARYOP(constant) 10617 10618 // Early exit if this isn't a vector operation, the operand of the 10619 // unary operation isn't a bitwise AND, or if the sizes of the operations 10620 // aren't the same. 10621 EVT VT = N->getValueType(0); 10622 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 10623 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 10624 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 10625 return SDValue(); 10626 10627 // Now check that the other operand of the AND is a constant. We could 10628 // make the transformation for non-constant splats as well, but it's unclear 10629 // that would be a benefit as it would not eliminate any operations, just 10630 // perform one more step in scalar code before moving to the vector unit. 10631 if (BuildVectorSDNode *BV = 10632 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 10633 // Bail out if the vector isn't a constant. 10634 if (!BV->isConstant()) 10635 return SDValue(); 10636 10637 // Everything checks out. Build up the new and improved node. 10638 SDLoc DL(N); 10639 EVT IntVT = BV->getValueType(0); 10640 // Create a new constant of the appropriate type for the transformed 10641 // DAG. 
10642     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
10643     // The AND node needs bitcasts to/from an integer vector type around it.
10644     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
10645     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
10646                                  N->getOperand(0)->getOperand(0), MaskConst);
10647     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
10648     return Res;
10649   }
10650 
10651   return SDValue();
10652 }
10653 
10654 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
10655                                      const AArch64Subtarget *Subtarget) {
10656   // First try to optimize away the conversion when it's conditionally from
10657   // a constant. Vectors only.
10658   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
10659     return Res;
10660 
10661   EVT VT = N->getValueType(0);
10662   if (VT != MVT::f32 && VT != MVT::f64)
10663     return SDValue();
10664 
10665   // Only optimize when the source and destination types have the same width.
10666   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
10667     return SDValue();
10668 
10669   // If the result of an integer load is only used by an integer-to-float
10670   // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
10671   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
10672   SDValue N0 = N->getOperand(0);
10673   if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
10674       // Do not change the width of a volatile load.
10675       !cast<LoadSDNode>(N0)->isVolatile()) {
10676     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
10677     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
10678                                LN0->getPointerInfo(), LN0->getAlignment(),
10679                                LN0->getMemOperand()->getFlags());
10680 
10681     // Make sure successors of the original load stay after it by updating them
10682     // to use the new Chain.
10683     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
10684 
10685     unsigned Opcode =
10686         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
10687     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
10688   }
10689 
10690   return SDValue();
10691 }
10692 
10693 /// Fold a floating-point multiply by power of two into floating-point to
10694 /// fixed-point conversion.
10695 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
10696                                      TargetLowering::DAGCombinerInfo &DCI,
10697                                      const AArch64Subtarget *Subtarget) {
10698   if (!Subtarget->hasNEON())
10699     return SDValue();
10700 
10701   if (!N->getValueType(0).isSimple())
10702     return SDValue();
10703 
10704   SDValue Op = N->getOperand(0);
10705   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10706       Op.getOpcode() != ISD::FMUL)
10707     return SDValue();
10708 
10709   SDValue ConstVec = Op->getOperand(1);
10710   if (!isa<BuildVectorSDNode>(ConstVec))
10711     return SDValue();
10712 
10713   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
10714   uint32_t FloatBits = FloatTy.getSizeInBits();
10715   if (FloatBits != 32 && FloatBits != 64)
10716     return SDValue();
10717 
10718   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
10719   uint32_t IntBits = IntTy.getSizeInBits();
10720   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
10721     return SDValue();
10722 
10723   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
10724 if (IntBits > FloatBits) 10725 return SDValue(); 10726 10727 BitVector UndefElements; 10728 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 10729 int32_t Bits = IntBits == 64 ? 64 : 32; 10730 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 10731 if (C == -1 || C == 0 || C > Bits) 10732 return SDValue(); 10733 10734 MVT ResTy; 10735 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 10736 switch (NumLanes) { 10737 default: 10738 return SDValue(); 10739 case 2: 10740 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 10741 break; 10742 case 4: 10743 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 10744 break; 10745 } 10746 10747 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 10748 return SDValue(); 10749 10750 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && 10751 "Illegal vector type after legalization"); 10752 10753 SDLoc DL(N); 10754 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 10755 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 10756 : Intrinsic::aarch64_neon_vcvtfp2fxu; 10757 SDValue FixConv = 10758 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 10759 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 10760 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 10761 // We can handle smaller integers by generating an extra trunc. 10762 if (IntBits < FloatBits) 10763 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 10764 10765 return FixConv; 10766 } 10767 10768 /// Fold a floating-point divide by power of two into fixed-point to 10769 /// floating-point conversion. 10770 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 10771 TargetLowering::DAGCombinerInfo &DCI, 10772 const AArch64Subtarget *Subtarget) { 10773 if (!Subtarget->hasNEON()) 10774 return SDValue(); 10775 10776 SDValue Op = N->getOperand(0); 10777 unsigned Opc = Op->getOpcode(); 10778 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 10779 !Op.getOperand(0).getValueType().isSimple() || 10780 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 10781 return SDValue(); 10782 10783 SDValue ConstVec = N->getOperand(1); 10784 if (!isa<BuildVectorSDNode>(ConstVec)) 10785 return SDValue(); 10786 10787 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 10788 int32_t IntBits = IntTy.getSizeInBits(); 10789 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 10790 return SDValue(); 10791 10792 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 10793 int32_t FloatBits = FloatTy.getSizeInBits(); 10794 if (FloatBits != 32 && FloatBits != 64) 10795 return SDValue(); 10796 10797 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 10798 if (IntBits > FloatBits) 10799 return SDValue(); 10800 10801 BitVector UndefElements; 10802 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 10803 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 10804 if (C == -1 || C == 0 || C > FloatBits) 10805 return SDValue(); 10806 10807 MVT ResTy; 10808 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 10809 switch (NumLanes) { 10810 default: 10811 return SDValue(); 10812 case 2: 10813 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 10814 break; 10815 case 4: 10816 ResTy = FloatBits == 32 ? 
MVT::v4i32 : MVT::v4i64; 10817 break; 10818 } 10819 10820 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 10821 return SDValue(); 10822 10823 SDLoc DL(N); 10824 SDValue ConvInput = Op.getOperand(0); 10825 bool IsSigned = Opc == ISD::SINT_TO_FP; 10826 if (IntBits < FloatBits) 10827 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 10828 ResTy, ConvInput); 10829 10830 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp 10831 : Intrinsic::aarch64_neon_vcvtfxu2fp; 10832 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 10833 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 10834 DAG.getConstant(C, DL, MVT::i32)); 10835 } 10836 10837 /// An EXTR instruction is made up of two shifts, ORed together. This helper 10838 /// searches for and classifies those shifts. 10839 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 10840 bool &FromHi) { 10841 if (N.getOpcode() == ISD::SHL) 10842 FromHi = false; 10843 else if (N.getOpcode() == ISD::SRL) 10844 FromHi = true; 10845 else 10846 return false; 10847 10848 if (!isa<ConstantSDNode>(N.getOperand(1))) 10849 return false; 10850 10851 ShiftAmount = N->getConstantOperandVal(1); 10852 Src = N->getOperand(0); 10853 return true; 10854 } 10855 10856 /// EXTR instruction extracts a contiguous chunk of bits from two existing 10857 /// registers viewed as a high/low pair. This function looks for the pattern: 10858 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it 10859 /// with an EXTR. Can't quite be done in TableGen because the two immediates 10860 /// aren't independent. 10861 static SDValue tryCombineToEXTR(SDNode *N, 10862 TargetLowering::DAGCombinerInfo &DCI) { 10863 SelectionDAG &DAG = DCI.DAG; 10864 SDLoc DL(N); 10865 EVT VT = N->getValueType(0); 10866 10867 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 10868 10869 if (VT != MVT::i32 && VT != MVT::i64) 10870 return SDValue(); 10871 10872 SDValue LHS; 10873 uint32_t ShiftLHS = 0; 10874 bool LHSFromHi = false; 10875 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 10876 return SDValue(); 10877 10878 SDValue RHS; 10879 uint32_t ShiftRHS = 0; 10880 bool RHSFromHi = false; 10881 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 10882 return SDValue(); 10883 10884 // If they're both trying to come from the high part of the register, they're 10885 // not really an EXTR. 10886 if (LHSFromHi == RHSFromHi) 10887 return SDValue(); 10888 10889 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 10890 return SDValue(); 10891 10892 if (LHSFromHi) { 10893 std::swap(LHS, RHS); 10894 std::swap(ShiftLHS, ShiftRHS); 10895 } 10896 10897 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 10898 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 10899 } 10900 10901 static SDValue tryCombineToBSL(SDNode *N, 10902 TargetLowering::DAGCombinerInfo &DCI) { 10903 EVT VT = N->getValueType(0); 10904 SelectionDAG &DAG = DCI.DAG; 10905 SDLoc DL(N); 10906 10907 if (!VT.isVector()) 10908 return SDValue(); 10909 10910 SDValue N0 = N->getOperand(0); 10911 if (N0.getOpcode() != ISD::AND) 10912 return SDValue(); 10913 10914 SDValue N1 = N->getOperand(1); 10915 if (N1.getOpcode() != ISD::AND) 10916 return SDValue(); 10917 10918 // We only have to look for constant vectors here since the general, variable 10919 // case can be handled in TableGen. 10920 unsigned Bits = VT.getScalarSizeInBits(); 10921 uint64_t BitMask = Bits == 64 ? 
-1ULL : ((1ULL << Bits) - 1); 10922 for (int i = 1; i >= 0; --i) 10923 for (int j = 1; j >= 0; --j) { 10924 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 10925 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 10926 if (!BVN0 || !BVN1) 10927 continue; 10928 10929 bool FoundMatch = true; 10930 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 10931 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 10932 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 10933 if (!CN0 || !CN1 || 10934 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 10935 FoundMatch = false; 10936 break; 10937 } 10938 } 10939 10940 if (FoundMatch) 10941 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), 10942 N0->getOperand(1 - i), N1->getOperand(1 - j)); 10943 } 10944 10945 return SDValue(); 10946 } 10947 10948 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 10949 const AArch64Subtarget *Subtarget) { 10950 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 10951 SelectionDAG &DAG = DCI.DAG; 10952 EVT VT = N->getValueType(0); 10953 10954 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10955 return SDValue(); 10956 10957 if (SDValue Res = tryCombineToEXTR(N, DCI)) 10958 return Res; 10959 10960 if (SDValue Res = tryCombineToBSL(N, DCI)) 10961 return Res; 10962 10963 return SDValue(); 10964 } 10965 10966 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { 10967 if (!MemVT.getVectorElementType().isSimple()) 10968 return false; 10969 10970 uint64_t MaskForTy = 0ull; 10971 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { 10972 case MVT::i8: 10973 MaskForTy = 0xffull; 10974 break; 10975 case MVT::i16: 10976 MaskForTy = 0xffffull; 10977 break; 10978 case MVT::i32: 10979 MaskForTy = 0xffffffffull; 10980 break; 10981 default: 10982 return false; 10983 break; 10984 } 10985 10986 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) 10987 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0))) 10988 return Op0->getAPIntValue().getLimitedValue() == MaskForTy; 10989 10990 return false; 10991 } 10992 10993 static SDValue performSVEAndCombine(SDNode *N, 10994 TargetLowering::DAGCombinerInfo &DCI) { 10995 if (DCI.isBeforeLegalizeOps()) 10996 return SDValue(); 10997 10998 SelectionDAG &DAG = DCI.DAG; 10999 SDValue Src = N->getOperand(0); 11000 unsigned Opc = Src->getOpcode(); 11001 11002 // Zero/any extend of an unsigned unpack 11003 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 11004 SDValue UnpkOp = Src->getOperand(0); 11005 SDValue Dup = N->getOperand(1); 11006 11007 if (Dup.getOpcode() != AArch64ISD::DUP) 11008 return SDValue(); 11009 11010 SDLoc DL(N); 11011 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); 11012 uint64_t ExtVal = C->getZExtValue(); 11013 11014 // If the mask is fully covered by the unpack, we don't need to push 11015 // a new AND onto the operand 11016 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); 11017 if ((ExtVal == 0xFF && EltTy == MVT::i8) || 11018 (ExtVal == 0xFFFF && EltTy == MVT::i16) || 11019 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) 11020 return Src; 11021 11022 // Truncate to prevent a DUP with an over wide constant 11023 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); 11024 11025 // Otherwise, make sure we propagate the AND to the operand 11026 // of the unpack 11027 Dup = DAG.getNode(AArch64ISD::DUP, DL, 
11028 UnpkOp->getValueType(0), 11029 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); 11030 11031 SDValue And = DAG.getNode(ISD::AND, DL, 11032 UnpkOp->getValueType(0), UnpkOp, Dup); 11033 11034 return DAG.getNode(Opc, DL, N->getValueType(0), And); 11035 } 11036 11037 SDValue Mask = N->getOperand(1); 11038 11039 if (!Src.hasOneUse()) 11040 return SDValue(); 11041 11042 EVT MemVT; 11043 11044 // SVE load instructions perform an implicit zero-extend, which makes them 11045 // perfect candidates for combining. 11046 switch (Opc) { 11047 case AArch64ISD::LD1_MERGE_ZERO: 11048 case AArch64ISD::LDNF1_MERGE_ZERO: 11049 case AArch64ISD::LDFF1_MERGE_ZERO: 11050 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); 11051 break; 11052 case AArch64ISD::GLD1_MERGE_ZERO: 11053 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 11054 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 11055 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 11056 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 11057 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 11058 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 11059 case AArch64ISD::GLDFF1_MERGE_ZERO: 11060 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 11061 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 11062 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 11063 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 11064 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 11065 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 11066 case AArch64ISD::GLDNT1_MERGE_ZERO: 11067 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); 11068 break; 11069 default: 11070 return SDValue(); 11071 } 11072 11073 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) 11074 return Src; 11075 11076 return SDValue(); 11077 } 11078 11079 static SDValue performANDCombine(SDNode *N, 11080 TargetLowering::DAGCombinerInfo &DCI) { 11081 SelectionDAG &DAG = DCI.DAG; 11082 SDValue LHS = N->getOperand(0); 11083 EVT VT = N->getValueType(0); 11084 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11085 return SDValue(); 11086 11087 if (VT.isScalableVector()) 11088 return performSVEAndCombine(N, DCI); 11089 11090 BuildVectorSDNode *BVN = 11091 dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); 11092 if (!BVN) 11093 return SDValue(); 11094 11095 // AND does not accept an immediate, so check if we can use a BIC immediate 11096 // instruction instead. We do this here instead of using a (and x, (mvni imm)) 11097 // pattern in isel, because some immediates may be lowered to the preferred 11098 // (and x, (movi imm)) form, even though an mvni representation also exists. 
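// For example (illustrative): a per-lane mask of 0xffffff00 cannot be
// encoded as an AND immediate, but its complement 0x000000ff can, so
//   (and v4i32:x, #0xffffff00)  ->  BIC Vd.4S, #0xff
// clears the low byte of each lane with a single instruction.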
11099 APInt DefBits(VT.getSizeInBits(), 0); 11100 APInt UndefBits(VT.getSizeInBits(), 0); 11101 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 11102 SDValue NewOp; 11103 11104 DefBits = ~DefBits; 11105 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 11106 DefBits, &LHS)) || 11107 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 11108 DefBits, &LHS))) 11109 return NewOp; 11110 11111 UndefBits = ~UndefBits; 11112 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 11113 UndefBits, &LHS)) || 11114 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 11115 UndefBits, &LHS))) 11116 return NewOp; 11117 } 11118 11119 return SDValue(); 11120 } 11121 11122 static SDValue performSRLCombine(SDNode *N, 11123 TargetLowering::DAGCombinerInfo &DCI) { 11124 SelectionDAG &DAG = DCI.DAG; 11125 EVT VT = N->getValueType(0); 11126 if (VT != MVT::i32 && VT != MVT::i64) 11127 return SDValue(); 11128 11129 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the 11130 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) 11131 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 11132 SDValue N0 = N->getOperand(0); 11133 if (N0.getOpcode() == ISD::BSWAP) { 11134 SDLoc DL(N); 11135 SDValue N1 = N->getOperand(1); 11136 SDValue N00 = N0.getOperand(0); 11137 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 11138 uint64_t ShiftAmt = C->getZExtValue(); 11139 if (VT == MVT::i32 && ShiftAmt == 16 && 11140 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) 11141 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 11142 if (VT == MVT::i64 && ShiftAmt == 32 && 11143 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) 11144 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 11145 } 11146 } 11147 return SDValue(); 11148 } 11149 11150 static SDValue performConcatVectorsCombine(SDNode *N, 11151 TargetLowering::DAGCombinerInfo &DCI, 11152 SelectionDAG &DAG) { 11153 SDLoc dl(N); 11154 EVT VT = N->getValueType(0); 11155 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 11156 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); 11157 11158 // Optimize concat_vectors of truncated vectors, where the intermediate 11159 // type is illegal, to avoid said illegality, e.g., 11160 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 11161 // (v2i16 (truncate (v2i64))))) 11162 // -> 11163 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 11164 // (v4i32 (bitcast (v2i64))), 11165 // <0, 2, 4, 6>))) 11166 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 11167 // on both input and result type, so we might generate worse code. 11168 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 11169 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && 11170 N1Opc == ISD::TRUNCATE) { 11171 SDValue N00 = N0->getOperand(0); 11172 SDValue N10 = N1->getOperand(0); 11173 EVT N00VT = N00.getValueType(); 11174 11175 if (N00VT == N10.getValueType() && 11176 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 11177 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 11178 MVT MidVT = (N00VT == MVT::v2i64 ? 
MVT::v4i32 : MVT::v8i16); 11179 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 11180 for (size_t i = 0; i < Mask.size(); ++i) 11181 Mask[i] = i * 2; 11182 return DAG.getNode(ISD::TRUNCATE, dl, VT, 11183 DAG.getVectorShuffle( 11184 MidVT, dl, 11185 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 11186 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 11187 } 11188 } 11189 11190 // Wait 'til after everything is legalized to try this. That way we have 11191 // legal vector types and such. 11192 if (DCI.isBeforeLegalizeOps()) 11193 return SDValue(); 11194 11195 // Optimise concat_vectors of two [us]rhadds that use extracted subvectors 11196 // from the same original vectors. Combine these into a single [us]rhadd that 11197 // operates on the two original vectors. Example: 11198 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), 11199 // extract_subvector (v16i8 OpB, 11200 // <0>))), 11201 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), 11202 // extract_subvector (v16i8 OpB, 11203 // <8>))))) 11204 // -> 11205 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) 11206 if (N->getNumOperands() == 2 && N0Opc == N1Opc && 11207 (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) { 11208 SDValue N00 = N0->getOperand(0); 11209 SDValue N01 = N0->getOperand(1); 11210 SDValue N10 = N1->getOperand(0); 11211 SDValue N11 = N1->getOperand(1); 11212 11213 EVT N00VT = N00.getValueType(); 11214 EVT N10VT = N10.getValueType(); 11215 11216 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && 11217 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && 11218 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && 11219 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { 11220 SDValue N00Source = N00->getOperand(0); 11221 SDValue N01Source = N01->getOperand(0); 11222 SDValue N10Source = N10->getOperand(0); 11223 SDValue N11Source = N11->getOperand(0); 11224 11225 if (N00Source == N10Source && N01Source == N11Source && 11226 N00Source.getValueType() == VT && N01Source.getValueType() == VT) { 11227 assert(N0.getValueType() == N1.getValueType()); 11228 11229 uint64_t N00Index = N00.getConstantOperandVal(1); 11230 uint64_t N01Index = N01.getConstantOperandVal(1); 11231 uint64_t N10Index = N10.getConstantOperandVal(1); 11232 uint64_t N11Index = N11.getConstantOperandVal(1); 11233 11234 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && 11235 N10Index == N00VT.getVectorNumElements()) 11236 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); 11237 } 11238 } 11239 } 11240 11241 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 11242 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 11243 // canonicalise to that. 11244 if (N0 == N1 && VT.getVectorNumElements() == 2) { 11245 assert(VT.getScalarSizeInBits() == 64); 11246 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 11247 DAG.getConstant(0, dl, MVT::i64)); 11248 } 11249 11250 // Canonicalise concat_vectors so that the right-hand vector has as few 11251 // bit-casts as possible before its real operation. The primary matching 11252 // destination for these operations will be the narrowing "2" instructions, 11253 // which depend on the operation being performed on this right-hand vector. 
11254 // For example, 11255 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 11256 // becomes 11257 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 11258 11259 if (N1Opc != ISD::BITCAST) 11260 return SDValue(); 11261 SDValue RHS = N1->getOperand(0); 11262 MVT RHSTy = RHS.getValueType().getSimpleVT(); 11263 // If the RHS is not a vector, this is not the pattern we're looking for. 11264 if (!RHSTy.isVector()) 11265 return SDValue(); 11266 11267 LLVM_DEBUG( 11268 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 11269 11270 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 11271 RHSTy.getVectorNumElements() * 2); 11272 return DAG.getNode(ISD::BITCAST, dl, VT, 11273 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 11274 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 11275 RHS)); 11276 } 11277 11278 static SDValue tryCombineFixedPointConvert(SDNode *N, 11279 TargetLowering::DAGCombinerInfo &DCI, 11280 SelectionDAG &DAG) { 11281 // Wait until after everything is legalized to try this. That way we have 11282 // legal vector types and such. 11283 if (DCI.isBeforeLegalizeOps()) 11284 return SDValue(); 11285 // Transform a scalar conversion of a value from a lane extract into a 11286 // lane extract of a vector conversion. E.g., from foo1 to foo2: 11287 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 11288 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 11289 // 11290 // The second form interacts better with instruction selection and the 11291 // register allocator to avoid cross-class register copies that aren't 11292 // coalescable due to a lane reference. 11293 11294 // Check the operand and see if it originates from a lane extract. 11295 SDValue Op1 = N->getOperand(1); 11296 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11297 // Yep, no additional predication needed. Perform the transform. 11298 SDValue IID = N->getOperand(0); 11299 SDValue Shift = N->getOperand(2); 11300 SDValue Vec = Op1.getOperand(0); 11301 SDValue Lane = Op1.getOperand(1); 11302 EVT ResTy = N->getValueType(0); 11303 EVT VecResTy; 11304 SDLoc DL(N); 11305 11306 // The vector width should be 128 bits by the time we get here, even 11307 // if it started as 64 bits (the extract_vector handling will have 11308 // done so). 11309 assert(Vec.getValueSizeInBits() == 128 && 11310 "unexpected vector size on extract_vector_elt!"); 11311 if (Vec.getValueType() == MVT::v4i32) 11312 VecResTy = MVT::v4f32; 11313 else if (Vec.getValueType() == MVT::v2i64) 11314 VecResTy = MVT::v2f64; 11315 else 11316 llvm_unreachable("unexpected vector type!"); 11317 11318 SDValue Convert = 11319 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 11320 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 11321 } 11322 return SDValue(); 11323 } 11324 11325 // AArch64 high-vector "long" operations are formed by performing the non-high 11326 // version on an extract_subvector of each operand which gets the high half: 11327 // 11328 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 11329 // 11330 // However, there are cases which don't have an extract_high explicitly, but 11331 // have another operation that can be made compatible with one for free. For 11332 // example: 11333 // 11334 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 11335 // 11336 // This routine does the actual conversion of such DUPs, once outer routines 11337 // have determined that everything else is in order. 
11338 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
11339 // similarly here.
11340 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
11341   switch (N.getOpcode()) {
11342   case AArch64ISD::DUP:
11343   case AArch64ISD::DUPLANE8:
11344   case AArch64ISD::DUPLANE16:
11345   case AArch64ISD::DUPLANE32:
11346   case AArch64ISD::DUPLANE64:
11347   case AArch64ISD::MOVI:
11348   case AArch64ISD::MOVIshift:
11349   case AArch64ISD::MOVIedit:
11350   case AArch64ISD::MOVImsl:
11351   case AArch64ISD::MVNIshift:
11352   case AArch64ISD::MVNImsl:
11353     break;
11354   default:
11355     // FMOV could be supported, but isn't very useful, as it would only occur
11356     // if you passed a bitcast'd floating point immediate to an eligible long
11357     // integer op (addl, smull, ...).
11358     return SDValue();
11359   }
11360 
11361   MVT NarrowTy = N.getSimpleValueType();
11362   if (!NarrowTy.is64BitVector())
11363     return SDValue();
11364 
11365   MVT ElementTy = NarrowTy.getVectorElementType();
11366   unsigned NumElems = NarrowTy.getVectorNumElements();
11367   MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
11368 
11369   SDLoc dl(N);
11370   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
11371                      DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
11372                      DAG.getConstant(NumElems, dl, MVT::i64));
11373 }
11374 
11375 static bool isEssentiallyExtractHighSubvector(SDValue N) {
11376   if (N.getOpcode() == ISD::BITCAST)
11377     N = N.getOperand(0);
11378   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
11379     return false;
11380   return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
11381          N.getOperand(0).getValueType().getVectorNumElements() / 2;
11382 }
11383 
11384 /// Helper structure to keep track of ISD::SET_CC operands.
11385 struct GenericSetCCInfo {
11386   const SDValue *Opnd0;
11387   const SDValue *Opnd1;
11388   ISD::CondCode CC;
11389 };
11390 
11391 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
11392 struct AArch64SetCCInfo {
11393   const SDValue *Cmp;
11394   AArch64CC::CondCode CC;
11395 };
11396 
11397 /// Helper structure to keep track of SetCC information.
11398 union SetCCInfo {
11399   GenericSetCCInfo Generic;
11400   AArch64SetCCInfo AArch64;
11401 };
11402 
11403 /// Helper structure to be able to read SetCC information. If the
11404 /// IsAArch64 field is set to true, Info is an AArch64SetCCInfo; otherwise
11405 /// Info is a GenericSetCCInfo.
11406 struct SetCCInfoAndKind {
11407   SetCCInfo Info;
11408   bool IsAArch64;
11409 };
11410 
11411 /// Check whether or not \p Op is a SET_CC operation, either a generic or
11412 /// an
11413 /// AArch64 lowered one.
11414 /// \p SetCCInfo is filled accordingly.
11415 /// \post SetCCInfo is meaningful only when this function returns true.
11416 /// \return True when Op is a kind of SET_CC operation.
11417 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
11418   // If this is a setcc, this is straightforward.
11419   if (Op.getOpcode() == ISD::SETCC) {
11420     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
11421     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
11422     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
11423     SetCCInfo.IsAArch64 = false;
11424     return true;
11425   }
11426   // Otherwise, check if this is a matching csel instruction.
11427   // In other words:
11428   // - csel 1, 0, cc
11429   // - csel 0, 1, !cc
11430   if (Op.getOpcode() != AArch64ISD::CSEL)
11431     return false;
11432   // Set the information about the operands.
11433   // TODO: we want the operands of the Cmp not the csel
11434   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
11435   SetCCInfo.IsAArch64 = true;
11436   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
11437       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
11438 
11439   // Check that the operands match the constraints:
11440   // (1) Both operands must be constants.
11441   // (2) One must be 1 and the other must be 0.
11442   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
11443   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
11444 
11445   // Check (1).
11446   if (!TValue || !FValue)
11447     return false;
11448 
11449   // Check (2).
11450   if (!TValue->isOne()) {
11451     // Update the comparison when we are interested in !cc.
11452     std::swap(TValue, FValue);
11453     SetCCInfo.Info.AArch64.CC =
11454         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
11455   }
11456   return TValue->isOne() && FValue->isNullValue();
11457 }
11458 
11459 // Returns true if Op is setcc or zext of setcc.
11460 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
11461   if (isSetCC(Op, Info))
11462     return true;
11463   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
11464           isSetCC(Op->getOperand(0), Info));
11465 }
11466 
11467 // The folding we want to perform is:
11468 // (add x, [zext] (setcc cc ...) )
11469 //   -->
11470 // (csel x, (add x, 1), !cc ...)
11471 //
11472 // The latter will get matched to a CSINC instruction.
11473 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
11474   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
11475   SDValue LHS = Op->getOperand(0);
11476   SDValue RHS = Op->getOperand(1);
11477   SetCCInfoAndKind InfoAndKind;
11478 
11479   // If neither operand is a SET_CC, give up.
11480   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
11481     std::swap(LHS, RHS);
11482     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
11483       return SDValue();
11484   }
11485 
11486   // FIXME: This could be generalized to work for FP comparisons.
11487   EVT CmpVT = InfoAndKind.IsAArch64
11488                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
11489                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
11490   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
11491     return SDValue();
11492 
11493   SDValue CCVal;
11494   SDValue Cmp;
11495   SDLoc dl(Op);
11496   if (InfoAndKind.IsAArch64) {
11497     CCVal = DAG.getConstant(
11498         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
11499         MVT::i32);
11500     Cmp = *InfoAndKind.Info.AArch64.Cmp;
11501   } else
11502     Cmp = getAArch64Cmp(
11503         *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
11504         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
11505         dl);
11506 
11507   EVT VT = Op->getValueType(0);
11508   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
11509   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
11510 }
11511 
11512 // The basic add/sub long vector instructions have variants with "2" on the end
11513 // which act on the high-half of their inputs. They are normally matched by
11514 // patterns like:
11515 //
11516 // (add (zeroext (extract_high LHS)),
11517 //      (zeroext (extract_high RHS)))
11518 //   -> uaddl2 vD, vN, vM
11519 //
11520 // However, if one of the extracts is something like a duplicate, this
11521 // instruction can still be used profitably. This function puts the DAG into a
11522 // more appropriate form for those patterns to trigger.
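// For instance (illustrative): with a scalar splat on one side,
//   (add (zeroext (extract_high LHS)), (zeroext (dup64 s)))
// can be rewritten so the DUP becomes (extract_high (dup128 s)), at which
// point the uaddl2 pattern above applies.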
11523 static SDValue performAddSubLongCombine(SDNode *N, 11524 TargetLowering::DAGCombinerInfo &DCI, 11525 SelectionDAG &DAG) { 11526 if (DCI.isBeforeLegalizeOps()) 11527 return SDValue(); 11528 11529 MVT VT = N->getSimpleValueType(0); 11530 if (!VT.is128BitVector()) { 11531 if (N->getOpcode() == ISD::ADD) 11532 return performSetccAddFolding(N, DAG); 11533 return SDValue(); 11534 } 11535 11536 // Make sure both branches are extended in the same way. 11537 SDValue LHS = N->getOperand(0); 11538 SDValue RHS = N->getOperand(1); 11539 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 11540 LHS.getOpcode() != ISD::SIGN_EXTEND) || 11541 LHS.getOpcode() != RHS.getOpcode()) 11542 return SDValue(); 11543 11544 unsigned ExtType = LHS.getOpcode(); 11545 11546 // It's not worth doing if at least one of the inputs isn't already an 11547 // extract, but we don't know which it'll be so we have to try both. 11548 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { 11549 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 11550 if (!RHS.getNode()) 11551 return SDValue(); 11552 11553 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 11554 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { 11555 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 11556 if (!LHS.getNode()) 11557 return SDValue(); 11558 11559 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 11560 } 11561 11562 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 11563 } 11564 11565 // Massage DAGs which we can use the high-half "long" operations on into 11566 // something isel will recognize better. E.g. 11567 // 11568 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 11569 // (aarch64_neon_umull (extract_high (v2i64 vec))) 11570 // (extract_high (v2i64 (dup128 scalar))))) 11571 // 11572 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 11573 TargetLowering::DAGCombinerInfo &DCI, 11574 SelectionDAG &DAG) { 11575 if (DCI.isBeforeLegalizeOps()) 11576 return SDValue(); 11577 11578 SDValue LHS = N->getOperand(1); 11579 SDValue RHS = N->getOperand(2); 11580 assert(LHS.getValueType().is64BitVector() && 11581 RHS.getValueType().is64BitVector() && 11582 "unexpected shape for long operation"); 11583 11584 // Either node could be a DUP, but it's not worth doing both of them (you'd 11585 // just as well use the non-high version) so look for a corresponding extract 11586 // operation on the other "wing". 
11587 if (isEssentiallyExtractHighSubvector(LHS)) { 11588 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 11589 if (!RHS.getNode()) 11590 return SDValue(); 11591 } else if (isEssentiallyExtractHighSubvector(RHS)) { 11592 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 11593 if (!LHS.getNode()) 11594 return SDValue(); 11595 } 11596 11597 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 11598 N->getOperand(0), LHS, RHS); 11599 } 11600 11601 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 11602 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 11603 unsigned ElemBits = ElemTy.getSizeInBits(); 11604 11605 int64_t ShiftAmount; 11606 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 11607 APInt SplatValue, SplatUndef; 11608 unsigned SplatBitSize; 11609 bool HasAnyUndefs; 11610 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 11611 HasAnyUndefs, ElemBits) || 11612 SplatBitSize != ElemBits) 11613 return SDValue(); 11614 11615 ShiftAmount = SplatValue.getSExtValue(); 11616 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 11617 ShiftAmount = CVN->getSExtValue(); 11618 } else 11619 return SDValue(); 11620 11621 unsigned Opcode; 11622 bool IsRightShift; 11623 switch (IID) { 11624 default: 11625 llvm_unreachable("Unknown shift intrinsic"); 11626 case Intrinsic::aarch64_neon_sqshl: 11627 Opcode = AArch64ISD::SQSHL_I; 11628 IsRightShift = false; 11629 break; 11630 case Intrinsic::aarch64_neon_uqshl: 11631 Opcode = AArch64ISD::UQSHL_I; 11632 IsRightShift = false; 11633 break; 11634 case Intrinsic::aarch64_neon_srshl: 11635 Opcode = AArch64ISD::SRSHR_I; 11636 IsRightShift = true; 11637 break; 11638 case Intrinsic::aarch64_neon_urshl: 11639 Opcode = AArch64ISD::URSHR_I; 11640 IsRightShift = true; 11641 break; 11642 case Intrinsic::aarch64_neon_sqshlu: 11643 Opcode = AArch64ISD::SQSHLU_I; 11644 IsRightShift = false; 11645 break; 11646 case Intrinsic::aarch64_neon_sshl: 11647 case Intrinsic::aarch64_neon_ushl: 11648 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular 11649 // left shift for positive shift amounts. Below, we only replace the current 11650 // node with VSHL, if this condition is met. 11651 Opcode = AArch64ISD::VSHL; 11652 IsRightShift = false; 11653 break; 11654 } 11655 11656 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 11657 SDLoc dl(N); 11658 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 11659 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 11660 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 11661 SDLoc dl(N); 11662 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 11663 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 11664 } 11665 11666 return SDValue(); 11667 } 11668 11669 // The CRC32[BH] instructions ignore the high bits of their data operand. Since 11670 // the intrinsics must be legal and take an i32, this means there's almost 11671 // certainly going to be a zext in the DAG which we can eliminate. 
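// For example (illustrative): given
//   %m = and i32 %data, 255
//   %r = call i32 @llvm.aarch64.crc32b(i32 %crc, i32 %m)
// the AND is redundant because CRC32B only reads the low 8 bits, so the
// combine below feeds %data to the node directly.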
11672 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 11673 SDValue AndN = N->getOperand(2); 11674 if (AndN.getOpcode() != ISD::AND) 11675 return SDValue(); 11676 11677 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 11678 if (!CMask || CMask->getZExtValue() != Mask) 11679 return SDValue(); 11680 11681 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 11682 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 11683 } 11684 11685 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 11686 SelectionDAG &DAG) { 11687 SDLoc dl(N); 11688 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 11689 DAG.getNode(Opc, dl, 11690 N->getOperand(1).getSimpleValueType(), 11691 N->getOperand(1)), 11692 DAG.getConstant(0, dl, MVT::i64)); 11693 } 11694 11695 static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, 11696 SelectionDAG &DAG) { 11697 SDLoc dl(N); 11698 LLVMContext &Ctx = *DAG.getContext(); 11699 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11700 11701 EVT VT = N->getValueType(0); 11702 SDValue Pred = N->getOperand(1); 11703 SDValue Data = N->getOperand(2); 11704 EVT DataVT = Data.getValueType(); 11705 11706 if (DataVT.getVectorElementType().isScalarInteger() && 11707 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) { 11708 if (!TLI.isTypeLegal(DataVT)) 11709 return SDValue(); 11710 11711 EVT OutputVT = EVT::getVectorVT(Ctx, VT, 11712 AArch64::NeonBitsPerVector / VT.getSizeInBits()); 11713 SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data); 11714 SDValue Zero = DAG.getConstant(0, dl, MVT::i64); 11715 SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero); 11716 11717 return Result; 11718 } 11719 11720 return SDValue(); 11721 } 11722 11723 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { 11724 SDLoc DL(N); 11725 SDValue Op1 = N->getOperand(1); 11726 SDValue Op2 = N->getOperand(2); 11727 EVT ScalarTy = Op1.getValueType(); 11728 11729 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) { 11730 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 11731 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 11732 } 11733 11734 return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0), 11735 Op1, Op2); 11736 } 11737 11738 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { 11739 SDLoc dl(N); 11740 SDValue Scalar = N->getOperand(3); 11741 EVT ScalarTy = Scalar.getValueType(); 11742 11743 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) 11744 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); 11745 11746 SDValue Passthru = N->getOperand(1); 11747 SDValue Pred = N->getOperand(2); 11748 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), 11749 Pred, Scalar, Passthru); 11750 } 11751 11752 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { 11753 SDLoc dl(N); 11754 LLVMContext &Ctx = *DAG.getContext(); 11755 EVT VT = N->getValueType(0); 11756 11757 assert(VT.isScalableVector() && "Expected a scalable vector."); 11758 11759 // Current lowering only supports the SVE-ACLE types. 
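// That is, scalable vectors whose known minimum size is one 128-bit SVE
// block, e.g. nxv16i8, nxv8i16, nxv4i32 or nxv2i64 (illustrative list).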
11760 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) 11761 return SDValue(); 11762 11763 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; 11764 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8; 11765 EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true }); 11766 11767 // Convert everything to the domain of EXT (i.e bytes). 11768 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); 11769 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2)); 11770 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3), 11771 DAG.getConstant(ElemSize, dl, MVT::i32)); 11772 11773 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2); 11774 return DAG.getNode(ISD::BITCAST, dl, VT, EXT); 11775 } 11776 11777 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 11778 TargetLowering::DAGCombinerInfo &DCI, 11779 SelectionDAG &DAG) { 11780 if (DCI.isBeforeLegalize()) 11781 return SDValue(); 11782 11783 SDValue Comparator = N->getOperand(3); 11784 if (Comparator.getOpcode() == AArch64ISD::DUP || 11785 Comparator.getOpcode() == ISD::SPLAT_VECTOR) { 11786 unsigned IID = getIntrinsicID(N); 11787 EVT VT = N->getValueType(0); 11788 EVT CmpVT = N->getOperand(2).getValueType(); 11789 SDValue Pred = N->getOperand(1); 11790 SDValue Imm; 11791 SDLoc DL(N); 11792 11793 switch (IID) { 11794 default: 11795 llvm_unreachable("Called with wrong intrinsic!"); 11796 break; 11797 11798 // Signed comparisons 11799 case Intrinsic::aarch64_sve_cmpeq_wide: 11800 case Intrinsic::aarch64_sve_cmpne_wide: 11801 case Intrinsic::aarch64_sve_cmpge_wide: 11802 case Intrinsic::aarch64_sve_cmpgt_wide: 11803 case Intrinsic::aarch64_sve_cmplt_wide: 11804 case Intrinsic::aarch64_sve_cmple_wide: { 11805 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 11806 int64_t ImmVal = CN->getSExtValue(); 11807 if (ImmVal >= -16 && ImmVal <= 15) 11808 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 11809 else 11810 return SDValue(); 11811 } 11812 break; 11813 } 11814 // Unsigned comparisons 11815 case Intrinsic::aarch64_sve_cmphs_wide: 11816 case Intrinsic::aarch64_sve_cmphi_wide: 11817 case Intrinsic::aarch64_sve_cmplo_wide: 11818 case Intrinsic::aarch64_sve_cmpls_wide: { 11819 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { 11820 uint64_t ImmVal = CN->getZExtValue(); 11821 if (ImmVal <= 127) 11822 Imm = DAG.getConstant(ImmVal, DL, MVT::i32); 11823 else 11824 return SDValue(); 11825 } 11826 break; 11827 } 11828 } 11829 11830 if (!Imm) 11831 return SDValue(); 11832 11833 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); 11834 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, 11835 N->getOperand(2), Splat, DAG.getCondCode(CC)); 11836 } 11837 11838 return SDValue(); 11839 } 11840 11841 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, 11842 AArch64CC::CondCode Cond) { 11843 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11844 11845 SDLoc DL(Op); 11846 assert(Op.getValueType().isScalableVector() && 11847 TLI.isTypeLegal(Op.getValueType()) && 11848 "Expected legal scalable vector type!"); 11849 11850 // Ensure target specific opcodes are using legal type. 11851 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); 11852 SDValue TVal = DAG.getConstant(1, DL, OutVT); 11853 SDValue FVal = DAG.getConstant(0, DL, OutVT); 11854 11855 // Set condition code (CC) flags. 
11856 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); 11857 11858 // Convert CC to integer based on requested condition. 11859 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 11860 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); 11861 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); 11862 return DAG.getZExtOrTrunc(Res, DL, VT); 11863 } 11864 11865 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, 11866 SelectionDAG &DAG) { 11867 SDLoc DL(N); 11868 11869 SDValue Pred = N->getOperand(1); 11870 SDValue VecToReduce = N->getOperand(2); 11871 11872 EVT ReduceVT = VecToReduce.getValueType(); 11873 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); 11874 11875 // SVE reductions set the whole vector register with the first element 11876 // containing the reduction result, which we'll now extract. 11877 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 11878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 11879 Zero); 11880 } 11881 11882 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, 11883 SelectionDAG &DAG) { 11884 SDLoc DL(N); 11885 11886 SDValue Pred = N->getOperand(1); 11887 SDValue InitVal = N->getOperand(2); 11888 SDValue VecToReduce = N->getOperand(3); 11889 EVT ReduceVT = VecToReduce.getValueType(); 11890 11891 // Ordered reductions use the first lane of the result vector as the 11892 // reduction's initial value. 11893 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 11894 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, 11895 DAG.getUNDEF(ReduceVT), InitVal, Zero); 11896 11897 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); 11898 11899 // SVE reductions set the whole vector register with the first element 11900 // containing the reduction result, which we'll now extract. 
11901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, 11902 Zero); 11903 } 11904 11905 static SDValue performIntrinsicCombine(SDNode *N, 11906 TargetLowering::DAGCombinerInfo &DCI, 11907 const AArch64Subtarget *Subtarget) { 11908 SelectionDAG &DAG = DCI.DAG; 11909 unsigned IID = getIntrinsicID(N); 11910 switch (IID) { 11911 default: 11912 break; 11913 case Intrinsic::aarch64_neon_vcvtfxs2fp: 11914 case Intrinsic::aarch64_neon_vcvtfxu2fp: 11915 return tryCombineFixedPointConvert(N, DCI, DAG); 11916 case Intrinsic::aarch64_neon_saddv: 11917 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 11918 case Intrinsic::aarch64_neon_uaddv: 11919 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 11920 case Intrinsic::aarch64_neon_sminv: 11921 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 11922 case Intrinsic::aarch64_neon_uminv: 11923 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 11924 case Intrinsic::aarch64_neon_smaxv: 11925 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 11926 case Intrinsic::aarch64_neon_umaxv: 11927 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 11928 case Intrinsic::aarch64_neon_fmax: 11929 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), 11930 N->getOperand(1), N->getOperand(2)); 11931 case Intrinsic::aarch64_neon_fmin: 11932 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), 11933 N->getOperand(1), N->getOperand(2)); 11934 case Intrinsic::aarch64_neon_fmaxnm: 11935 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 11936 N->getOperand(1), N->getOperand(2)); 11937 case Intrinsic::aarch64_neon_fminnm: 11938 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 11939 N->getOperand(1), N->getOperand(2)); 11940 case Intrinsic::aarch64_neon_smull: 11941 case Intrinsic::aarch64_neon_umull: 11942 case Intrinsic::aarch64_neon_pmull: 11943 case Intrinsic::aarch64_neon_sqdmull: 11944 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 11945 case Intrinsic::aarch64_neon_sqshl: 11946 case Intrinsic::aarch64_neon_uqshl: 11947 case Intrinsic::aarch64_neon_sqshlu: 11948 case Intrinsic::aarch64_neon_srshl: 11949 case Intrinsic::aarch64_neon_urshl: 11950 case Intrinsic::aarch64_neon_sshl: 11951 case Intrinsic::aarch64_neon_ushl: 11952 return tryCombineShiftImm(IID, N, DAG); 11953 case Intrinsic::aarch64_crc32b: 11954 case Intrinsic::aarch64_crc32cb: 11955 return tryCombineCRC32(0xff, N, DAG); 11956 case Intrinsic::aarch64_crc32h: 11957 case Intrinsic::aarch64_crc32ch: 11958 return tryCombineCRC32(0xffff, N, DAG); 11959 case Intrinsic::aarch64_sve_smaxv: 11960 return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG); 11961 case Intrinsic::aarch64_sve_umaxv: 11962 return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG); 11963 case Intrinsic::aarch64_sve_sminv: 11964 return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG); 11965 case Intrinsic::aarch64_sve_uminv: 11966 return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG); 11967 case Intrinsic::aarch64_sve_orv: 11968 return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG); 11969 case Intrinsic::aarch64_sve_eorv: 11970 return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG); 11971 case Intrinsic::aarch64_sve_andv: 11972 return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG); 11973 case Intrinsic::aarch64_sve_index: 11974 return LowerSVEIntrinsicIndex(N, DAG); 11975 case Intrinsic::aarch64_sve_dup: 11976 return LowerSVEIntrinsicDUP(N, DAG); 11977 case 
Intrinsic::aarch64_sve_dup_x: 11978 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), 11979 N->getOperand(1)); 11980 case Intrinsic::aarch64_sve_ext: 11981 return LowerSVEIntrinsicEXT(N, DAG); 11982 case Intrinsic::aarch64_sve_smin: 11983 return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), 11984 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 11985 case Intrinsic::aarch64_sve_umin: 11986 return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), 11987 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 11988 case Intrinsic::aarch64_sve_smax: 11989 return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), 11990 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 11991 case Intrinsic::aarch64_sve_umax: 11992 return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), 11993 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 11994 case Intrinsic::aarch64_sve_lsl: 11995 return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0), 11996 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 11997 case Intrinsic::aarch64_sve_lsr: 11998 return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0), 11999 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 12000 case Intrinsic::aarch64_sve_asr: 12001 return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0), 12002 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 12003 case Intrinsic::aarch64_sve_cmphs: 12004 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12005 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12006 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12007 N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); 12008 break; 12009 case Intrinsic::aarch64_sve_cmphi: 12010 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12011 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12012 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12013 N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); 12014 break; 12015 case Intrinsic::aarch64_sve_cmpge: 12016 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12017 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12018 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12019 N->getOperand(3), DAG.getCondCode(ISD::SETGE)); 12020 break; 12021 case Intrinsic::aarch64_sve_cmpgt: 12022 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12023 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12024 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12025 N->getOperand(3), DAG.getCondCode(ISD::SETGT)); 12026 break; 12027 case Intrinsic::aarch64_sve_cmpeq: 12028 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12029 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12030 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12031 N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); 12032 break; 12033 case Intrinsic::aarch64_sve_cmpne: 12034 if (!N->getOperand(2).getValueType().isFloatingPoint()) 12035 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 12036 N->getValueType(0), N->getOperand(1), N->getOperand(2), 12037 N->getOperand(3), DAG.getCondCode(ISD::SETNE)); 12038 break; 12039 case Intrinsic::aarch64_sve_fadda: 12040 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); 12041 case Intrinsic::aarch64_sve_faddv: 12042 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); 12043 case Intrinsic::aarch64_sve_fmaxnmv: 12044 
return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); 12045 case Intrinsic::aarch64_sve_fmaxv: 12046 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); 12047 case Intrinsic::aarch64_sve_fminnmv: 12048 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); 12049 case Intrinsic::aarch64_sve_fminv: 12050 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); 12051 case Intrinsic::aarch64_sve_sel: 12052 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), 12053 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 12054 case Intrinsic::aarch64_sve_cmpeq_wide: 12055 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); 12056 case Intrinsic::aarch64_sve_cmpne_wide: 12057 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); 12058 case Intrinsic::aarch64_sve_cmpge_wide: 12059 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); 12060 case Intrinsic::aarch64_sve_cmpgt_wide: 12061 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); 12062 case Intrinsic::aarch64_sve_cmplt_wide: 12063 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); 12064 case Intrinsic::aarch64_sve_cmple_wide: 12065 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); 12066 case Intrinsic::aarch64_sve_cmphs_wide: 12067 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); 12068 case Intrinsic::aarch64_sve_cmphi_wide: 12069 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); 12070 case Intrinsic::aarch64_sve_cmplo_wide: 12071 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); 12072 case Intrinsic::aarch64_sve_cmpls_wide: 12073 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); 12074 case Intrinsic::aarch64_sve_ptest_any: 12075 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 12076 AArch64CC::ANY_ACTIVE); 12077 case Intrinsic::aarch64_sve_ptest_first: 12078 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 12079 AArch64CC::FIRST_ACTIVE); 12080 case Intrinsic::aarch64_sve_ptest_last: 12081 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 12082 AArch64CC::LAST_ACTIVE); 12083 } 12084 return SDValue(); 12085 } 12086 12087 static SDValue performExtendCombine(SDNode *N, 12088 TargetLowering::DAGCombinerInfo &DCI, 12089 SelectionDAG &DAG) { 12090 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 12091 // we can convert that DUP into another extract_high (of a bigger DUP), which 12092 // helps the backend to decide that an sabdl2 would be useful, saving a real 12093 // extract_high operation. 12094 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 12095 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 12096 SDNode *ABDNode = N->getOperand(0).getNode(); 12097 unsigned IID = getIntrinsicID(ABDNode); 12098 if (IID == Intrinsic::aarch64_neon_sabd || 12099 IID == Intrinsic::aarch64_neon_uabd) { 12100 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 12101 if (!NewABD.getNode()) 12102 return SDValue(); 12103 12104 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 12105 NewABD); 12106 } 12107 } 12108 12109 // This is effectively a custom type legalization for AArch64. 12110 // 12111 // Type legalization will split an extend of a small, legal, type to a larger 12112 // illegal type by first splitting the destination type, often creating 12113 // illegal source types, which then get legalized in isel-confusing ways, 12114 // leading to really terrible codegen. 
E.g., 12115 // %result = v8i32 sext v8i8 %value 12116 // becomes 12117 // %losrc = extract_subreg %value, ... 12118 // %hisrc = extract_subreg %value, ... 12119 // %lo = v4i32 sext v4i8 %losrc 12120 // %hi = v4i32 sext v4i8 %hisrc 12121 // Things go rapidly downhill from there. 12122 // 12123 // For AArch64, the [sz]ext vector instructions can only go up one element 12124 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 12125 // take two instructions. 12126 // 12127 // This implies that the most efficient way to do the extend from v8i8 12128 // to two v4i32 values is to first extend the v8i8 to v8i16, then do 12129 // the normal splitting to happen for the v8i16->v8i32. 12130 12131 // This is pre-legalization to catch some cases where the default 12132 // type legalization will create ill-tempered code. 12133 if (!DCI.isBeforeLegalizeOps()) 12134 return SDValue(); 12135 12136 // We're only interested in cleaning things up for non-legal vector types 12137 // here. If both the source and destination are legal, things will just 12138 // work naturally without any fiddling. 12139 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12140 EVT ResVT = N->getValueType(0); 12141 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) 12142 return SDValue(); 12143 // If the vector type isn't a simple VT, it's beyond the scope of what 12144 // we're worried about here. Let legalization do its thing and hope for 12145 // the best. 12146 SDValue Src = N->getOperand(0); 12147 EVT SrcVT = Src->getValueType(0); 12148 if (!ResVT.isSimple() || !SrcVT.isSimple()) 12149 return SDValue(); 12150 12151 // If the source VT is a 64-bit fixed or scalable vector, we can play games 12152 // and get the better results we want. 12153 if (SrcVT.getSizeInBits().getKnownMinSize() != 64) 12154 return SDValue(); 12155 12156 unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); 12157 ElementCount SrcEC = SrcVT.getVectorElementCount(); 12158 SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC); 12159 SDLoc DL(N); 12160 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); 12161 12162 // Now split the rest of the operation into two halves, each with a 64 12163 // bit source. 12164 EVT LoVT, HiVT; 12165 SDValue Lo, Hi; 12166 LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext()); 12167 12168 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 12169 LoVT.getVectorElementCount()); 12170 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 12171 DAG.getConstant(0, DL, MVT::i64)); 12172 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 12173 DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64)); 12174 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); 12175 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); 12176 12177 // Now combine the parts back together so we still have a single result 12178 // like the combiner expects. 12179 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); 12180 } 12181 12182 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, 12183 SDValue SplatVal, unsigned NumVecElts) { 12184 assert(!St.isTruncatingStore() && "cannot split truncating vector store"); 12185 unsigned OrigAlignment = St.getAlignment(); 12186 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; 12187 12188 // Create scalar stores. This is at least as good as the code sequence for a 12189 // split unaligned store which is a dup.s, ext.b, and two stores. 
12190 // Most of the time the three stores should be replaced by store pair 12191 // instructions (stp). 12192 SDLoc DL(&St); 12193 SDValue BasePtr = St.getBasePtr(); 12194 uint64_t BaseOffset = 0; 12195 12196 const MachinePointerInfo &PtrInfo = St.getPointerInfo(); 12197 SDValue NewST1 = 12198 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, 12199 OrigAlignment, St.getMemOperand()->getFlags()); 12200 12201 // As this in ISel, we will not merge this add which may degrade results. 12202 if (BasePtr->getOpcode() == ISD::ADD && 12203 isa<ConstantSDNode>(BasePtr->getOperand(1))) { 12204 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); 12205 BasePtr = BasePtr->getOperand(0); 12206 } 12207 12208 unsigned Offset = EltOffset; 12209 while (--NumVecElts) { 12210 unsigned Alignment = MinAlign(OrigAlignment, Offset); 12211 SDValue OffsetPtr = 12212 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 12213 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); 12214 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 12215 PtrInfo.getWithOffset(Offset), Alignment, 12216 St.getMemOperand()->getFlags()); 12217 Offset += EltOffset; 12218 } 12219 return NewST1; 12220 } 12221 12222 // Returns an SVE type that ContentTy can be trivially sign or zero extended 12223 // into. 12224 static MVT getSVEContainerType(EVT ContentTy) { 12225 assert(ContentTy.isSimple() && "No SVE containers for extended types"); 12226 12227 switch (ContentTy.getSimpleVT().SimpleTy) { 12228 default: 12229 llvm_unreachable("No known SVE container for this MVT type"); 12230 case MVT::nxv2i8: 12231 case MVT::nxv2i16: 12232 case MVT::nxv2i32: 12233 case MVT::nxv2i64: 12234 case MVT::nxv2f32: 12235 case MVT::nxv2f64: 12236 return MVT::nxv2i64; 12237 case MVT::nxv4i8: 12238 case MVT::nxv4i16: 12239 case MVT::nxv4i32: 12240 case MVT::nxv4f32: 12241 return MVT::nxv4i32; 12242 case MVT::nxv8i8: 12243 case MVT::nxv8i16: 12244 case MVT::nxv8f16: 12245 case MVT::nxv8bf16: 12246 return MVT::nxv8i16; 12247 case MVT::nxv16i8: 12248 return MVT::nxv16i8; 12249 } 12250 } 12251 12252 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { 12253 SDLoc DL(N); 12254 EVT VT = N->getValueType(0); 12255 12256 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) 12257 return SDValue(); 12258 12259 EVT ContainerVT = VT; 12260 if (ContainerVT.isInteger()) 12261 ContainerVT = getSVEContainerType(ContainerVT); 12262 12263 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); 12264 SDValue Ops[] = { N->getOperand(0), // Chain 12265 N->getOperand(2), // Pg 12266 N->getOperand(3), // Base 12267 DAG.getValueType(VT) }; 12268 12269 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); 12270 SDValue LoadChain = SDValue(Load.getNode(), 1); 12271 12272 if (ContainerVT.isInteger() && (VT != ContainerVT)) 12273 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); 12274 12275 return DAG.getMergeValues({ Load, LoadChain }, DL); 12276 } 12277 12278 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { 12279 SDLoc DL(N); 12280 EVT VT = N->getValueType(0); 12281 EVT PtrTy = N->getOperand(3).getValueType(); 12282 12283 if (VT == MVT::nxv8bf16 && 12284 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 12285 return SDValue(); 12286 12287 EVT LoadVT = VT; 12288 if (VT.isFloatingPoint()) 12289 LoadVT = VT.changeTypeToInteger(); 12290 12291 auto *MINode = cast<MemIntrinsicSDNode>(N); 12292 SDValue PassThru = DAG.getConstant(0, DL, LoadVT); 12293 SDValue L = 
DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), 12294 MINode->getOperand(3), DAG.getUNDEF(PtrTy), 12295 MINode->getOperand(2), PassThru, 12296 MINode->getMemoryVT(), MINode->getMemOperand(), 12297 ISD::UNINDEXED, ISD::NON_EXTLOAD, false); 12298 12299 if (VT.isFloatingPoint()) { 12300 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) }; 12301 return DAG.getMergeValues(Ops, DL); 12302 } 12303 12304 return L; 12305 } 12306 12307 template <unsigned Opcode> 12308 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { 12309 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || 12310 Opcode == AArch64ISD::LD1RO_MERGE_ZERO, 12311 "Unsupported opcode."); 12312 SDLoc DL(N); 12313 EVT VT = N->getValueType(0); 12314 if (VT == MVT::nxv8bf16 && 12315 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 12316 return SDValue(); 12317 12318 EVT LoadVT = VT; 12319 if (VT.isFloatingPoint()) 12320 LoadVT = VT.changeTypeToInteger(); 12321 12322 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; 12323 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); 12324 SDValue LoadChain = SDValue(Load.getNode(), 1); 12325 12326 if (VT.isFloatingPoint()) 12327 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); 12328 12329 return DAG.getMergeValues({Load, LoadChain}, DL); 12330 } 12331 12332 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { 12333 SDLoc DL(N); 12334 SDValue Data = N->getOperand(2); 12335 EVT DataVT = Data.getValueType(); 12336 EVT HwSrcVt = getSVEContainerType(DataVT); 12337 SDValue InputVT = DAG.getValueType(DataVT); 12338 12339 if (DataVT == MVT::nxv8bf16 && 12340 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 12341 return SDValue(); 12342 12343 if (DataVT.isFloatingPoint()) 12344 InputVT = DAG.getValueType(HwSrcVt); 12345 12346 SDValue SrcNew; 12347 if (Data.getValueType().isFloatingPoint()) 12348 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); 12349 else 12350 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); 12351 12352 SDValue Ops[] = { N->getOperand(0), // Chain 12353 SrcNew, 12354 N->getOperand(4), // Base 12355 N->getOperand(3), // Pg 12356 InputVT 12357 }; 12358 12359 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); 12360 } 12361 12362 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { 12363 SDLoc DL(N); 12364 12365 SDValue Data = N->getOperand(2); 12366 EVT DataVT = Data.getValueType(); 12367 EVT PtrTy = N->getOperand(4).getValueType(); 12368 12369 if (DataVT == MVT::nxv8bf16 && 12370 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) 12371 return SDValue(); 12372 12373 if (DataVT.isFloatingPoint()) 12374 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); 12375 12376 auto *MINode = cast<MemIntrinsicSDNode>(N); 12377 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4), 12378 DAG.getUNDEF(PtrTy), MINode->getOperand(3), 12379 MINode->getMemoryVT(), MINode->getMemOperand(), 12380 ISD::UNINDEXED, false, false); 12381 } 12382 12383 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The 12384 /// load store optimizer pass will merge them to store pair stores. This should 12385 /// be better than a movi to create the vector zero followed by a vector store 12386 /// if the zero constant is not re-used, since one instructions and one register 12387 /// live range will be removed. 
12388 /// 12389 /// For example, the final generated code should be: 12390 /// 12391 /// stp xzr, xzr, [x0] 12392 /// 12393 /// instead of: 12394 /// 12395 /// movi v0.2d, #0 12396 /// str q0, [x0] 12397 /// 12398 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 12399 SDValue StVal = St.getValue(); 12400 EVT VT = StVal.getValueType(); 12401 12402 // Avoid scalarizing zero splat stores for scalable vectors. 12403 if (VT.isScalableVector()) 12404 return SDValue(); 12405 12406 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or 12407 // 2, 3 or 4 i32 elements. 12408 int NumVecElts = VT.getVectorNumElements(); 12409 if (!(((NumVecElts == 2 || NumVecElts == 3) && 12410 VT.getVectorElementType().getSizeInBits() == 64) || 12411 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && 12412 VT.getVectorElementType().getSizeInBits() == 32))) 12413 return SDValue(); 12414 12415 if (StVal.getOpcode() != ISD::BUILD_VECTOR) 12416 return SDValue(); 12417 12418 // If the zero constant has more than one use then the vector store could be 12419 // better since the constant mov will be amortized and stp q instructions 12420 // should be able to be formed. 12421 if (!StVal.hasOneUse()) 12422 return SDValue(); 12423 12424 // If the store is truncating then it's going down to i16 or smaller, which 12425 // means it can be implemented in a single store anyway. 12426 if (St.isTruncatingStore()) 12427 return SDValue(); 12428 12429 // If the immediate offset of the address operand is too large for the stp 12430 // instruction, then bail out. 12431 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { 12432 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); 12433 if (Offset < -512 || Offset > 504) 12434 return SDValue(); 12435 } 12436 12437 for (int I = 0; I < NumVecElts; ++I) { 12438 SDValue EltVal = StVal.getOperand(I); 12439 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) 12440 return SDValue(); 12441 } 12442 12443 // Use a CopyFromReg WZR/XZR here to prevent 12444 // DAGCombiner::MergeConsecutiveStores from undoing this transformation. 12445 SDLoc DL(&St); 12446 unsigned ZeroReg; 12447 EVT ZeroVT; 12448 if (VT.getVectorElementType().getSizeInBits() == 32) { 12449 ZeroReg = AArch64::WZR; 12450 ZeroVT = MVT::i32; 12451 } else { 12452 ZeroReg = AArch64::XZR; 12453 ZeroVT = MVT::i64; 12454 } 12455 SDValue SplatVal = 12456 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); 12457 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 12458 } 12459 12460 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 12461 /// value. The load store optimizer pass will merge them to store pair stores. 12462 /// This has better performance than a splat of the scalar followed by a split 12463 /// vector store. Even if the stores are not merged it is four stores vs a dup, 12464 /// followed by an ext.b and two stores. 12465 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 12466 SDValue StVal = St.getValue(); 12467 EVT VT = StVal.getValueType(); 12468 12469 // Don't replace floating point stores, they possibly won't be transformed to 12470 // stp because of the store pair suppress pass. 12471 if (VT.isFloatingPoint()) 12472 return SDValue(); 12473 12474 // We can express a splat as store pair(s) for 2 or 4 elements. 
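  // For instance (illustrative), a v4i32 splat of w1 stored to [x0] becomes
  // four scalar stores of w1 at offsets 0, 4, 8 and 12, which the load/store
  // optimizer can then merge into:
  //   stp w1, w1, [x0]
  //   stp w1, w1, [x0, #8]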
12475 unsigned NumVecElts = VT.getVectorNumElements(); 12476 if (NumVecElts != 4 && NumVecElts != 2) 12477 return SDValue(); 12478 12479 // If the store is truncating then it's going down to i16 or smaller, which 12480 // means it can be implemented in a single store anyway. 12481 if (St.isTruncatingStore()) 12482 return SDValue(); 12483 12484 // Check that this is a splat. 12485 // Make sure that each of the relevant vector element locations are inserted 12486 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. 12487 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); 12488 SDValue SplatVal; 12489 for (unsigned I = 0; I < NumVecElts; ++I) { 12490 // Check for insert vector elements. 12491 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 12492 return SDValue(); 12493 12494 // Check that same value is inserted at each vector element. 12495 if (I == 0) 12496 SplatVal = StVal.getOperand(1); 12497 else if (StVal.getOperand(1) != SplatVal) 12498 return SDValue(); 12499 12500 // Check insert element index. 12501 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2)); 12502 if (!CIndex) 12503 return SDValue(); 12504 uint64_t IndexVal = CIndex->getZExtValue(); 12505 if (IndexVal >= NumVecElts) 12506 return SDValue(); 12507 IndexNotInserted.reset(IndexVal); 12508 12509 StVal = StVal.getOperand(0); 12510 } 12511 // Check that all vector element locations were inserted to. 12512 if (IndexNotInserted.any()) 12513 return SDValue(); 12514 12515 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 12516 } 12517 12518 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 12519 SelectionDAG &DAG, 12520 const AArch64Subtarget *Subtarget) { 12521 12522 StoreSDNode *S = cast<StoreSDNode>(N); 12523 if (S->isVolatile() || S->isIndexed()) 12524 return SDValue(); 12525 12526 SDValue StVal = S->getValue(); 12527 EVT VT = StVal.getValueType(); 12528 12529 if (!VT.isFixedLengthVector()) 12530 return SDValue(); 12531 12532 // If we get a splat of zeros, convert this vector store to a store of 12533 // scalars. They will be merged into store pairs of xzr thereby removing one 12534 // instruction and one register. 12535 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) 12536 return ReplacedZeroSplat; 12537 12538 // FIXME: The logic for deciding if an unaligned store should be split should 12539 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 12540 // a call to that function here. 12541 12542 if (!Subtarget->isMisaligned128StoreSlow()) 12543 return SDValue(); 12544 12545 // Don't split at -Oz. 12546 if (DAG.getMachineFunction().getFunction().hasMinSize()) 12547 return SDValue(); 12548 12549 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 12550 // those up regresses performance on micro-benchmarks and olden/bh. 12551 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 12552 return SDValue(); 12553 12554 // Split unaligned 16B stores. They are terrible for performance. 12555 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 12556 // extensions can use this to mark that it does not want splitting to happen 12557 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 12558 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 12559 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 12560 S->getAlignment() <= 2) 12561 return SDValue(); 12562 12563 // If we get a splat of a scalar convert this vector store to a store of 12564 // scalars. 
They will be merged into store pairs thereby removing two 12565 // instructions. 12566 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) 12567 return ReplacedSplat; 12568 12569 SDLoc DL(S); 12570 12571 // Split VT into two. 12572 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 12573 unsigned NumElts = HalfVT.getVectorNumElements(); 12574 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 12575 DAG.getConstant(0, DL, MVT::i64)); 12576 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 12577 DAG.getConstant(NumElts, DL, MVT::i64)); 12578 SDValue BasePtr = S->getBasePtr(); 12579 SDValue NewST1 = 12580 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 12581 S->getAlignment(), S->getMemOperand()->getFlags()); 12582 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 12583 DAG.getConstant(8, DL, MVT::i64)); 12584 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 12585 S->getPointerInfo(), S->getAlignment(), 12586 S->getMemOperand()->getFlags()); 12587 } 12588 12589 /// Target-specific DAG combine function for post-increment LD1 (lane) and 12590 /// post-increment LD1R. 12591 static SDValue performPostLD1Combine(SDNode *N, 12592 TargetLowering::DAGCombinerInfo &DCI, 12593 bool IsLaneOp) { 12594 if (DCI.isBeforeLegalizeOps()) 12595 return SDValue(); 12596 12597 SelectionDAG &DAG = DCI.DAG; 12598 EVT VT = N->getValueType(0); 12599 12600 if (VT.isScalableVector()) 12601 return SDValue(); 12602 12603 unsigned LoadIdx = IsLaneOp ? 1 : 0; 12604 SDNode *LD = N->getOperand(LoadIdx).getNode(); 12605 // If it is not LOAD, can not do such combine. 12606 if (LD->getOpcode() != ISD::LOAD) 12607 return SDValue(); 12608 12609 // The vector lane must be a constant in the LD1LANE opcode. 12610 SDValue Lane; 12611 if (IsLaneOp) { 12612 Lane = N->getOperand(2); 12613 auto *LaneC = dyn_cast<ConstantSDNode>(Lane); 12614 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) 12615 return SDValue(); 12616 } 12617 12618 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 12619 EVT MemVT = LoadSDN->getMemoryVT(); 12620 // Check if memory operand is the same type as the vector element. 12621 if (MemVT != VT.getVectorElementType()) 12622 return SDValue(); 12623 12624 // Check if there are other uses. If so, do not combine as it will introduce 12625 // an extra load. 12626 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 12627 ++UI) { 12628 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 12629 continue; 12630 if (*UI != N) 12631 return SDValue(); 12632 } 12633 12634 SDValue Addr = LD->getOperand(1); 12635 SDValue Vector = N->getOperand(0); 12636 // Search for a use of the address operand that is an increment. 12637 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 12638 Addr.getNode()->use_end(); UI != UE; ++UI) { 12639 SDNode *User = *UI; 12640 if (User->getOpcode() != ISD::ADD 12641 || UI.getUse().getResNo() != Addr.getResNo()) 12642 continue; 12643 12644 // If the increment is a constant, it must match the memory ref size. 12645 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 12646 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 12647 uint32_t IncVal = CInc->getZExtValue(); 12648 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 12649 if (IncVal != NumBytes) 12650 continue; 12651 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 12652 } 12653 12654 // To avoid cycle construction make sure that neither the load nor the add 12655 // are predecessors to each other or the Vector. 12656 SmallPtrSet<const SDNode *, 32> Visited; 12657 SmallVector<const SDNode *, 16> Worklist; 12658 Visited.insert(Addr.getNode()); 12659 Worklist.push_back(User); 12660 Worklist.push_back(LD); 12661 Worklist.push_back(Vector.getNode()); 12662 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || 12663 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 12664 continue; 12665 12666 SmallVector<SDValue, 8> Ops; 12667 Ops.push_back(LD->getOperand(0)); // Chain 12668 if (IsLaneOp) { 12669 Ops.push_back(Vector); // The vector to be inserted 12670 Ops.push_back(Lane); // The lane to be inserted in the vector 12671 } 12672 Ops.push_back(Addr); 12673 Ops.push_back(Inc); 12674 12675 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 12676 SDVTList SDTys = DAG.getVTList(Tys); 12677 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 12678 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 12679 MemVT, 12680 LoadSDN->getMemOperand()); 12681 12682 // Update the uses. 12683 SDValue NewResults[] = { 12684 SDValue(LD, 0), // The result of load 12685 SDValue(UpdN.getNode(), 2) // Chain 12686 }; 12687 DCI.CombineTo(LD, NewResults); 12688 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 12689 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 12690 12691 break; 12692 } 12693 return SDValue(); 12694 } 12695 12696 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during 12697 /// address translation. 12698 static bool performTBISimplification(SDValue Addr, 12699 TargetLowering::DAGCombinerInfo &DCI, 12700 SelectionDAG &DAG) { 12701 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 12702 KnownBits Known; 12703 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 12704 !DCI.isBeforeLegalizeOps()); 12705 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12706 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { 12707 DCI.CommitTargetLoweringOpt(TLO); 12708 return true; 12709 } 12710 return false; 12711 } 12712 12713 static SDValue performSTORECombine(SDNode *N, 12714 TargetLowering::DAGCombinerInfo &DCI, 12715 SelectionDAG &DAG, 12716 const AArch64Subtarget *Subtarget) { 12717 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) 12718 return Split; 12719 12720 if (Subtarget->supportsAddressTopByteIgnored() && 12721 performTBISimplification(N->getOperand(2), DCI, DAG)) 12722 return SDValue(N, 0); 12723 12724 return SDValue(); 12725 } 12726 12727 12728 /// Target-specific DAG combine function for NEON load/store intrinsics 12729 /// to merge base address updates. 12730 static SDValue performNEONPostLDSTCombine(SDNode *N, 12731 TargetLowering::DAGCombinerInfo &DCI, 12732 SelectionDAG &DAG) { 12733 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12734 return SDValue(); 12735 12736 unsigned AddrOpIdx = N->getNumOperands() - 1; 12737 SDValue Addr = N->getOperand(AddrOpIdx); 12738 12739 // Search for a use of the address operand that is an increment. 
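  // A constant increment must match the total size of the memory access (see
  // the check below); a non-constant increment is kept as the register operand
  // of the post-incremented form.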
12740 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 12741 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 12742 SDNode *User = *UI; 12743 if (User->getOpcode() != ISD::ADD || 12744 UI.getUse().getResNo() != Addr.getResNo()) 12745 continue; 12746 12747 // Check that the add is independent of the load/store. Otherwise, folding 12748 // it would create a cycle. 12749 SmallPtrSet<const SDNode *, 32> Visited; 12750 SmallVector<const SDNode *, 16> Worklist; 12751 Visited.insert(Addr.getNode()); 12752 Worklist.push_back(N); 12753 Worklist.push_back(User); 12754 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 12755 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 12756 continue; 12757 12758 // Find the new opcode for the updating load/store. 12759 bool IsStore = false; 12760 bool IsLaneOp = false; 12761 bool IsDupOp = false; 12762 unsigned NewOpc = 0; 12763 unsigned NumVecs = 0; 12764 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 12765 switch (IntNo) { 12766 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 12767 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 12768 NumVecs = 2; break; 12769 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 12770 NumVecs = 3; break; 12771 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 12772 NumVecs = 4; break; 12773 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 12774 NumVecs = 2; IsStore = true; break; 12775 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 12776 NumVecs = 3; IsStore = true; break; 12777 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 12778 NumVecs = 4; IsStore = true; break; 12779 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 12780 NumVecs = 2; break; 12781 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 12782 NumVecs = 3; break; 12783 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 12784 NumVecs = 4; break; 12785 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 12786 NumVecs = 2; IsStore = true; break; 12787 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 12788 NumVecs = 3; IsStore = true; break; 12789 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 12790 NumVecs = 4; IsStore = true; break; 12791 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 12792 NumVecs = 2; IsDupOp = true; break; 12793 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 12794 NumVecs = 3; IsDupOp = true; break; 12795 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 12796 NumVecs = 4; IsDupOp = true; break; 12797 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 12798 NumVecs = 2; IsLaneOp = true; break; 12799 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 12800 NumVecs = 3; IsLaneOp = true; break; 12801 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 12802 NumVecs = 4; IsLaneOp = true; break; 12803 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 12804 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 12805 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 12806 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 12807 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 12808 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 12809 } 12810 12811 EVT VecTy; 12812 if (IsStore) 12813 VecTy = 
N->getOperand(2).getValueType(); 12814 else 12815 VecTy = N->getValueType(0); 12816 12817 // If the increment is a constant, it must match the memory ref size. 12818 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 12819 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 12820 uint32_t IncVal = CInc->getZExtValue(); 12821 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 12822 if (IsLaneOp || IsDupOp) 12823 NumBytes /= VecTy.getVectorNumElements(); 12824 if (IncVal != NumBytes) 12825 continue; 12826 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 12827 } 12828 SmallVector<SDValue, 8> Ops; 12829 Ops.push_back(N->getOperand(0)); // Incoming chain 12830 // Load lane and store have vector list as input. 12831 if (IsLaneOp || IsStore) 12832 for (unsigned i = 2; i < AddrOpIdx; ++i) 12833 Ops.push_back(N->getOperand(i)); 12834 Ops.push_back(Addr); // Base register 12835 Ops.push_back(Inc); 12836 12837 // Return Types. 12838 EVT Tys[6]; 12839 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 12840 unsigned n; 12841 for (n = 0; n < NumResultVecs; ++n) 12842 Tys[n] = VecTy; 12843 Tys[n++] = MVT::i64; // Type of write back register 12844 Tys[n] = MVT::Other; // Type of the chain 12845 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 12846 12847 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 12848 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 12849 MemInt->getMemoryVT(), 12850 MemInt->getMemOperand()); 12851 12852 // Update the uses. 12853 std::vector<SDValue> NewResults; 12854 for (unsigned i = 0; i < NumResultVecs; ++i) { 12855 NewResults.push_back(SDValue(UpdN.getNode(), i)); 12856 } 12857 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 12858 DCI.CombineTo(N, NewResults); 12859 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 12860 12861 break; 12862 } 12863 return SDValue(); 12864 } 12865 12866 // Checks to see if the value is the prescribed width and returns information 12867 // about its extension mode. 12868 static 12869 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 12870 ExtType = ISD::NON_EXTLOAD; 12871 switch(V.getNode()->getOpcode()) { 12872 default: 12873 return false; 12874 case ISD::LOAD: { 12875 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 12876 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 12877 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 12878 ExtType = LoadNode->getExtensionType(); 12879 return true; 12880 } 12881 return false; 12882 } 12883 case ISD::AssertSext: { 12884 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 12885 if ((TypeNode->getVT() == MVT::i8 && width == 8) 12886 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 12887 ExtType = ISD::SEXTLOAD; 12888 return true; 12889 } 12890 return false; 12891 } 12892 case ISD::AssertZext: { 12893 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 12894 if ((TypeNode->getVT() == MVT::i8 && width == 8) 12895 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 12896 ExtType = ISD::ZEXTLOAD; 12897 return true; 12898 } 12899 return false; 12900 } 12901 case ISD::Constant: 12902 case ISD::TargetConstant: { 12903 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 12904 1LL << (width - 1); 12905 } 12906 } 12907 12908 return true; 12909 } 12910 12911 // This function does a whole lot of voodoo to determine if the tests are 12912 // equivalent without and with a mask. 
Essentially what happens is that given a
12913 // DAG resembling:
12914 //
12915 //  +-------------+ +-------------+ +-------------+ +-------------+
12916 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
12917 //  +-------------+ +-------------+ +-------------+ +-------------+
12918 //           |              |           |             |
12919 //           V              V           |    +----------+
12920 //          +-------------+  +----+     |    |
12921 //          |     ADD     |  |0xff|     |    |
12922 //          +-------------+  +----+     |    |
12923 //                  |           |       |    |
12924 //                  V           V       |    |
12925 //                 +-------------+      |    |
12926 //                 |     AND     |      |    |
12927 //                 +-------------+      |    |
12928 //                      |               |    |
12929 //                      +-----+         |    |
12930 //                            |         |    |
12931 //                            V         V    V
12932 //                           +-------------+
12933 //                           |     CMP     |
12934 //                           +-------------+
12935 //
12936 // The AND node may be safely removed for some combinations of inputs. In
12937 // particular we need to take into account the extension type of the Input,
12938 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
12939 // width of the input (this can work for any width of input; the above graph is
12940 // specific to 8 bits).
12941 //
12942 // The specific equations were worked out by generating output tables for each
12943 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
12944 // problem was simplified by working with 4 bit inputs, which means we only
12945 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
12946 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
12947 // patterns present in both extensions (0,7). For every distinct set of
12948 // AddConstant and CompConstant bit patterns we can consider the masked and
12949 // unmasked versions to be equivalent if the result of this function is true for
12950 // all 16 distinct bit patterns of the current extension type of Input (w0).
12951 //
12952 //   sub      w8, w0, w1
12953 //   and      w10, w8, #0x0f
12954 //   cmp      w8, w2
12955 //   cset     w9, AArch64CC
12956 //   cmp      w10, w2
12957 //   cset     w11, AArch64CC
12958 //   cmp      w9, w11
12959 //   cset     w0, eq
12960 //   ret
12961 //
12962 // Since the above function shows when the outputs are equivalent, it defines
12963 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
12964 // would be expensive to run during compiles. The equations below were written
12965 // in a test harness that confirmed they give outputs equivalent to the above
12966 // function for all inputs, so they can be used to determine whether the removal
12967 // is legal instead.
12968 //
12969 // isEquivalentMaskless() is the test for whether the AND can be removed,
12970 // factored out of the DAG recognition because the DAG can take several forms.
12971
12972 static bool isEquivalentMaskless(unsigned CC, unsigned width,
12973                                  ISD::LoadExtType ExtType, int AddConstant,
12974                                  int CompConstant) {
12975   // By being careful about our equations and only writing them in terms of
12976   // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
12977   // make them generally applicable to all bit widths.
12978   int MaxUInt = (1 << width);
12979
12980   // For the purposes of these comparisons sign extending the type is
12981   // equivalent to zero extending the add and displacing it by half the integer
12982   // width. Provided we are careful and make sure our equations are valid over
12983   // the whole range we can just adjust the input and avoid writing equations
12984   // for sign extended inputs.
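  // For example, with width == 8 a sign-extended input behaves like a
  // zero-extended one whose AddConstant has been reduced by 128, which is
  // exactly the adjustment made below.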
12985 if (ExtType == ISD::SEXTLOAD) 12986 AddConstant -= (1 << (width-1)); 12987 12988 switch(CC) { 12989 case AArch64CC::LE: 12990 case AArch64CC::GT: 12991 if ((AddConstant == 0) || 12992 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 12993 (AddConstant >= 0 && CompConstant < 0) || 12994 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 12995 return true; 12996 break; 12997 case AArch64CC::LT: 12998 case AArch64CC::GE: 12999 if ((AddConstant == 0) || 13000 (AddConstant >= 0 && CompConstant <= 0) || 13001 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 13002 return true; 13003 break; 13004 case AArch64CC::HI: 13005 case AArch64CC::LS: 13006 if ((AddConstant >= 0 && CompConstant < 0) || 13007 (AddConstant <= 0 && CompConstant >= -1 && 13008 CompConstant < AddConstant + MaxUInt)) 13009 return true; 13010 break; 13011 case AArch64CC::PL: 13012 case AArch64CC::MI: 13013 if ((AddConstant == 0) || 13014 (AddConstant > 0 && CompConstant <= 0) || 13015 (AddConstant < 0 && CompConstant <= AddConstant)) 13016 return true; 13017 break; 13018 case AArch64CC::LO: 13019 case AArch64CC::HS: 13020 if ((AddConstant >= 0 && CompConstant <= 0) || 13021 (AddConstant <= 0 && CompConstant >= 0 && 13022 CompConstant <= AddConstant + MaxUInt)) 13023 return true; 13024 break; 13025 case AArch64CC::EQ: 13026 case AArch64CC::NE: 13027 if ((AddConstant > 0 && CompConstant < 0) || 13028 (AddConstant < 0 && CompConstant >= 0 && 13029 CompConstant < AddConstant + MaxUInt) || 13030 (AddConstant >= 0 && CompConstant >= 0 && 13031 CompConstant >= AddConstant) || 13032 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 13033 return true; 13034 break; 13035 case AArch64CC::VS: 13036 case AArch64CC::VC: 13037 case AArch64CC::AL: 13038 case AArch64CC::NV: 13039 return true; 13040 case AArch64CC::Invalid: 13041 break; 13042 } 13043 13044 return false; 13045 } 13046 13047 static 13048 SDValue performCONDCombine(SDNode *N, 13049 TargetLowering::DAGCombinerInfo &DCI, 13050 SelectionDAG &DAG, unsigned CCIndex, 13051 unsigned CmpIndex) { 13052 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 13053 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 13054 unsigned CondOpcode = SubsNode->getOpcode(); 13055 13056 if (CondOpcode != AArch64ISD::SUBS) 13057 return SDValue(); 13058 13059 // There is a SUBS feeding this condition. Is it fed by a mask we can 13060 // use? 13061 13062 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 13063 unsigned MaskBits = 0; 13064 13065 if (AndNode->getOpcode() != ISD::AND) 13066 return SDValue(); 13067 13068 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 13069 uint32_t CNV = CN->getZExtValue(); 13070 if (CNV == 255) 13071 MaskBits = 8; 13072 else if (CNV == 65535) 13073 MaskBits = 16; 13074 } 13075 13076 if (!MaskBits) 13077 return SDValue(); 13078 13079 SDValue AddValue = AndNode->getOperand(0); 13080 13081 if (AddValue.getOpcode() != ISD::ADD) 13082 return SDValue(); 13083 13084 // The basic dag structure is correct, grab the inputs and validate them. 13085 13086 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 13087 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 13088 SDValue SubsInputValue = SubsNode->getOperand(1); 13089 13090 // The mask is present and the provenance of all the values is a smaller type, 13091 // lets see if the mask is superfluous. 
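  // Both the second add operand and the SUBS right-hand side must be constants
  // so that isEquivalentMaskless() can reason about their exact values.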
13092 13093 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 13094 !isa<ConstantSDNode>(SubsInputValue.getNode())) 13095 return SDValue(); 13096 13097 ISD::LoadExtType ExtType; 13098 13099 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 13100 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 13101 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 13102 return SDValue(); 13103 13104 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 13105 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 13106 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 13107 return SDValue(); 13108 13109 // The AND is not necessary, remove it. 13110 13111 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 13112 SubsNode->getValueType(1)); 13113 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 13114 13115 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 13116 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 13117 13118 return SDValue(N, 0); 13119 } 13120 13121 // Optimize compare with zero and branch. 13122 static SDValue performBRCONDCombine(SDNode *N, 13123 TargetLowering::DAGCombinerInfo &DCI, 13124 SelectionDAG &DAG) { 13125 MachineFunction &MF = DAG.getMachineFunction(); 13126 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 13127 // will not be produced, as they are conditional branch instructions that do 13128 // not set flags. 13129 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) 13130 return SDValue(); 13131 13132 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) 13133 N = NV.getNode(); 13134 SDValue Chain = N->getOperand(0); 13135 SDValue Dest = N->getOperand(1); 13136 SDValue CCVal = N->getOperand(2); 13137 SDValue Cmp = N->getOperand(3); 13138 13139 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 13140 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 13141 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 13142 return SDValue(); 13143 13144 unsigned CmpOpc = Cmp.getOpcode(); 13145 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 13146 return SDValue(); 13147 13148 // Only attempt folding if there is only one use of the flag and no use of the 13149 // value. 13150 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 13151 return SDValue(); 13152 13153 SDValue LHS = Cmp.getOperand(0); 13154 SDValue RHS = Cmp.getOperand(1); 13155 13156 assert(LHS.getValueType() == RHS.getValueType() && 13157 "Expected the value type to be the same for both operands!"); 13158 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 13159 return SDValue(); 13160 13161 if (isNullConstant(LHS)) 13162 std::swap(LHS, RHS); 13163 13164 if (!isNullConstant(RHS)) 13165 return SDValue(); 13166 13167 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 13168 LHS.getOpcode() == ISD::SRL) 13169 return SDValue(); 13170 13171 // Fold the compare into the branch instruction. 13172 SDValue BR; 13173 if (CC == AArch64CC::EQ) 13174 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 13175 else 13176 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 13177 13178 // Do not add new nodes to DAG combiner worklist. 13179 DCI.CombineTo(N, BR, false); 13180 13181 return SDValue(); 13182 } 13183 13184 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 13185 // as well as whether the test should be inverted. 
This code is required to 13186 // catch these cases (as opposed to standard dag combines) because 13187 // AArch64ISD::TBZ is matched during legalization. 13188 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 13189 SelectionDAG &DAG) { 13190 13191 if (!Op->hasOneUse()) 13192 return Op; 13193 13194 // We don't handle undef/constant-fold cases below, as they should have 13195 // already been taken care of (e.g. and of 0, test of undefined shifted bits, 13196 // etc.) 13197 13198 // (tbz (trunc x), b) -> (tbz x, b) 13199 // This case is just here to enable more of the below cases to be caught. 13200 if (Op->getOpcode() == ISD::TRUNCATE && 13201 Bit < Op->getValueType(0).getSizeInBits()) { 13202 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13203 } 13204 13205 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 13206 if (Op->getOpcode() == ISD::ANY_EXTEND && 13207 Bit < Op->getOperand(0).getValueSizeInBits()) { 13208 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13209 } 13210 13211 if (Op->getNumOperands() != 2) 13212 return Op; 13213 13214 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 13215 if (!C) 13216 return Op; 13217 13218 switch (Op->getOpcode()) { 13219 default: 13220 return Op; 13221 13222 // (tbz (and x, m), b) -> (tbz x, b) 13223 case ISD::AND: 13224 if ((C->getZExtValue() >> Bit) & 1) 13225 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13226 return Op; 13227 13228 // (tbz (shl x, c), b) -> (tbz x, b-c) 13229 case ISD::SHL: 13230 if (C->getZExtValue() <= Bit && 13231 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 13232 Bit = Bit - C->getZExtValue(); 13233 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13234 } 13235 return Op; 13236 13237 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 13238 case ISD::SRA: 13239 Bit = Bit + C->getZExtValue(); 13240 if (Bit >= Op->getValueType(0).getSizeInBits()) 13241 Bit = Op->getValueType(0).getSizeInBits() - 1; 13242 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13243 13244 // (tbz (srl x, c), b) -> (tbz x, b+c) 13245 case ISD::SRL: 13246 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 13247 Bit = Bit + C->getZExtValue(); 13248 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13249 } 13250 return Op; 13251 13252 // (tbz (xor x, -1), b) -> (tbnz x, b) 13253 case ISD::XOR: 13254 if ((C->getZExtValue() >> Bit) & 1) 13255 Invert = !Invert; 13256 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 13257 } 13258 } 13259 13260 // Optimize test single bit zero/non-zero and branch. 
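// This rewrites TBZ/TBNZ nodes whose tested operand can be simplified by
// getTestBitOperand() above, e.g. by looking through truncates, extends,
// shifts and masks.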
13261 static SDValue performTBZCombine(SDNode *N, 13262 TargetLowering::DAGCombinerInfo &DCI, 13263 SelectionDAG &DAG) { 13264 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 13265 bool Invert = false; 13266 SDValue TestSrc = N->getOperand(1); 13267 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 13268 13269 if (TestSrc == NewTestSrc) 13270 return SDValue(); 13271 13272 unsigned NewOpc = N->getOpcode(); 13273 if (Invert) { 13274 if (NewOpc == AArch64ISD::TBZ) 13275 NewOpc = AArch64ISD::TBNZ; 13276 else { 13277 assert(NewOpc == AArch64ISD::TBNZ); 13278 NewOpc = AArch64ISD::TBZ; 13279 } 13280 } 13281 13282 SDLoc DL(N); 13283 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 13284 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 13285 } 13286 13287 // vselect (v1i1 setcc) -> 13288 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 13289 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 13290 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 13291 // such VSELECT. 13292 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 13293 SDValue N0 = N->getOperand(0); 13294 EVT CCVT = N0.getValueType(); 13295 13296 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || 13297 CCVT.getVectorElementType() != MVT::i1) 13298 return SDValue(); 13299 13300 EVT ResVT = N->getValueType(0); 13301 EVT CmpVT = N0.getOperand(0).getValueType(); 13302 // Only combine when the result type is of the same size as the compared 13303 // operands. 13304 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 13305 return SDValue(); 13306 13307 SDValue IfTrue = N->getOperand(1); 13308 SDValue IfFalse = N->getOperand(2); 13309 SDValue SetCC = 13310 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 13311 N0.getOperand(0), N0.getOperand(1), 13312 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 13313 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 13314 IfTrue, IfFalse); 13315 } 13316 13317 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 13318 /// the compare-mask instructions rather than going via NZCV, even if LHS and 13319 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 13320 /// with a vector one followed by a DUP shuffle on the result. 13321 static SDValue performSelectCombine(SDNode *N, 13322 TargetLowering::DAGCombinerInfo &DCI) { 13323 SelectionDAG &DAG = DCI.DAG; 13324 SDValue N0 = N->getOperand(0); 13325 EVT ResVT = N->getValueType(0); 13326 13327 if (N0.getOpcode() != ISD::SETCC) 13328 return SDValue(); 13329 13330 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 13331 // scalar SetCCResultType. We also don't expect vectors, because we assume 13332 // that selects fed by vector SETCCs are canonicalized to VSELECT. 13333 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 13334 "Scalar-SETCC feeding SELECT has unexpected result type!"); 13335 13336 // If NumMaskElts == 0, the comparison is larger than select result. The 13337 // largest real NEON comparison is 64-bits per lane, which means the result is 13338 // at most 32-bits and an illegal vector. Just bail out for now. 13339 EVT SrcVT = N0.getOperand(0).getValueType(); 13340 13341 // Don't try to do this optimization when the setcc itself has i1 operands. 13342 // There are no legal vectors of i1, so this would be pointless. 
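  // When the combine applies - e.g. (illustrative) a select of v2i64 values
  // whose condition is a scalar f64 setcc - the scalar operands are moved into
  // lane 0 of v2f64 vectors, compared with a vector SETCC, and the lane-0
  // result is duplicated across all lanes to form the mask.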
13343 if (SrcVT == MVT::i1) 13344 return SDValue(); 13345 13346 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 13347 if (!ResVT.isVector() || NumMaskElts == 0) 13348 return SDValue(); 13349 13350 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 13351 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 13352 13353 // Also bail out if the vector CCVT isn't the same size as ResVT. 13354 // This can happen if the SETCC operand size doesn't divide the ResVT size 13355 // (e.g., f64 vs v3f32). 13356 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 13357 return SDValue(); 13358 13359 // Make sure we didn't create illegal types, if we're not supposed to. 13360 assert(DCI.isBeforeLegalize() || 13361 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 13362 13363 // First perform a vector comparison, where lane 0 is the one we're interested 13364 // in. 13365 SDLoc DL(N0); 13366 SDValue LHS = 13367 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 13368 SDValue RHS = 13369 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 13370 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 13371 13372 // Now duplicate the comparison mask we want across all other lanes. 13373 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 13374 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 13375 Mask = DAG.getNode(ISD::BITCAST, DL, 13376 ResVT.changeVectorElementTypeToInteger(), Mask); 13377 13378 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 13379 } 13380 13381 /// Get rid of unnecessary NVCASTs (that don't change the type). 13382 static SDValue performNVCASTCombine(SDNode *N) { 13383 if (N->getValueType(0) == N->getOperand(0).getValueType()) 13384 return N->getOperand(0); 13385 13386 return SDValue(); 13387 } 13388 13389 // If all users of the globaladdr are of the form (globaladdr + constant), find 13390 // the smallest constant, fold it into the globaladdr's offset and rewrite the 13391 // globaladdr as (globaladdr + constant) - constant. 13392 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, 13393 const AArch64Subtarget *Subtarget, 13394 const TargetMachine &TM) { 13395 auto *GN = cast<GlobalAddressSDNode>(N); 13396 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != 13397 AArch64II::MO_NO_FLAG) 13398 return SDValue(); 13399 13400 uint64_t MinOffset = -1ull; 13401 for (SDNode *N : GN->uses()) { 13402 if (N->getOpcode() != ISD::ADD) 13403 return SDValue(); 13404 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0)); 13405 if (!C) 13406 C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13407 if (!C) 13408 return SDValue(); 13409 MinOffset = std::min(MinOffset, C->getZExtValue()); 13410 } 13411 uint64_t Offset = MinOffset + GN->getOffset(); 13412 13413 // Require that the new offset is larger than the existing one. Otherwise, we 13414 // can end up oscillating between two possible DAGs, for example, 13415 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). 13416 if (Offset <= uint64_t(GN->getOffset())) 13417 return SDValue(); 13418 13419 // Check whether folding this offset is legal. It must not go out of bounds of 13420 // the referenced object to avoid violating the code model, and must be 13421 // smaller than 2^21 because this is the largest offset expressible in all 13422 // object formats. 
//
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
if (Offset >= (1 << 21))
  return SDValue();

const GlobalValue *GV = GN->getGlobal();
Type *T = GV->getValueType();
if (!T->isSized() ||
    Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
  return SDValue();

SDLoc DL(GN);
SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
                   DAG.getConstant(MinOffset, DL, MVT::i64));
}

// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
                                          SDLoc DL, unsigned BitWidth) {
  assert(Offset.getValueType().isScalableVector() &&
         "This method is only for scalable vectors of offsets");

  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);

  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}

/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.

inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes)
    return false;

  // The immediate is out of range.
  if (OffsetInBytes / ScalarSizeInBytes > 31)
    return false;

  return true;
}

/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
                                           unsigned ScalarSizeInBytes) {
  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
  return OffsetConst && isValidImmForSVEVecImmAddrMode(
                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
}

static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                          unsigned Opcode,
                                          bool OnlyPackedOffsets = true) {
  const SDValue Src = N->getOperand(2);
  const EVT SrcVT = Src->getValueType(0);
  assert(SrcVT.isScalableVector() &&
         "Scatter stores are only possible for SVE vectors");

  SDLoc DL(N);
  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

  // Make sure that source data will fit into an SVE register
  if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // For FPs, ACLE only supports _packed_ single and double precision types.
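  // Concretely, this means only nxv4f32 and nxv2f64 sources are accepted;
  // unpacked FP types such as nxv2f32 (and the f16 forms) are rejected by the
  // check that follows.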
  if (SrcElVT.isFloatingPoint())
    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
      return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(4);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets  (that fits into one register)
  SDValue Offset = N->getOperand(5);

  // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal scatters because there's no instruction that takes
  // indices.
  if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
    Offset =
        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
    Opcode = AArch64ISD::SSTNT1_PRED;
  }

  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size: "scalar + vector", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // SST1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the stored items. For
  // immediates outside that range and non-immediate scalar offsets use SST1 or
  // SST1_UXTW instead.
  if (Opcode == AArch64ISD::SST1_IMM_PRED) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        SrcVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = AArch64ISD::SST1_UXTW_PRED;
      else
        Opcode = AArch64ISD::SST1_PRED;

      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  if (!TLI.isTypeLegal(Offset.getValueType()))
    return SDValue();

  // Source value type that is representable in hardware
  EVT HwSrcVt = getSVEContainerType(SrcVT);

  // Keep the original type of the input data to store - this is needed to be
  // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
  // FP values we want the integer equivalent, so just use HwSrcVt.
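  // Rough sketch of the intent here (illustrative, not exhaustive): an
  // nxv2i16 source is widened to its nxv2i64 container via the ANY_EXTEND
  // below, while InputVT stays nxv2i16 so that instruction selection can
  // still pick ST1H rather than ST1D.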
13572 SDValue InputVT = DAG.getValueType(SrcVT); 13573 if (SrcVT.isFloatingPoint()) 13574 InputVT = DAG.getValueType(HwSrcVt); 13575 13576 SDVTList VTs = DAG.getVTList(MVT::Other); 13577 SDValue SrcNew; 13578 13579 if (Src.getValueType().isFloatingPoint()) 13580 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src); 13581 else 13582 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src); 13583 13584 SDValue Ops[] = {N->getOperand(0), // Chain 13585 SrcNew, 13586 N->getOperand(3), // Pg 13587 Base, 13588 Offset, 13589 InputVT}; 13590 13591 return DAG.getNode(Opcode, DL, VTs, Ops); 13592 } 13593 13594 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, 13595 unsigned Opcode, 13596 bool OnlyPackedOffsets = true) { 13597 const EVT RetVT = N->getValueType(0); 13598 assert(RetVT.isScalableVector() && 13599 "Gather loads are only possible for SVE vectors"); 13600 13601 SDLoc DL(N); 13602 13603 // Make sure that the loaded data will fit into an SVE register 13604 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) 13605 return SDValue(); 13606 13607 // Depending on the addressing mode, this is either a pointer or a vector of 13608 // pointers (that fits into one register) 13609 SDValue Base = N->getOperand(3); 13610 // Depending on the addressing mode, this is either a single offset or a 13611 // vector of offsets (that fits into one register) 13612 SDValue Offset = N->getOperand(4); 13613 13614 // For "scalar + vector of indices", just scale the indices. This only 13615 // applies to non-temporal gathers because there's no instruction that takes 13616 // indicies. 13617 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { 13618 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, 13619 RetVT.getScalarSizeInBits()); 13620 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; 13621 } 13622 13623 // In the case of non-temporal gather loads there's only one SVE instruction 13624 // per data-size: "scalar + vector", i.e. 13625 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] 13626 // Since we do have intrinsics that allow the arguments to be in a different 13627 // order, we may need to swap them to match the spec. 13628 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && 13629 Offset.getValueType().isVector()) 13630 std::swap(Base, Offset); 13631 13632 // GLD{FF}1_IMM requires that the offset is an immediate that is: 13633 // * a multiple of #SizeInBytes, 13634 // * in the range [0, 31 x #SizeInBytes], 13635 // where #SizeInBytes is the size in bytes of the loaded items. For 13636 // immediates outside that range and non-immediate scalar offsets use 13637 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. 13638 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || 13639 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { 13640 if (!isValidImmForSVEVecImmAddrMode(Offset, 13641 RetVT.getScalarSizeInBits() / 8)) { 13642 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) 13643 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) 13644 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO 13645 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; 13646 else 13647 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) 13648 ? AArch64ISD::GLD1_MERGE_ZERO 13649 : AArch64ISD::GLDFF1_MERGE_ZERO; 13650 13651 std::swap(Base, Offset); 13652 } 13653 } 13654 13655 auto &TLI = DAG.getTargetLoweringInfo(); 13656 if (!TLI.isTypeLegal(Base.getValueType())) 13657 return SDValue(); 13658 13659 // Some gather load variants allow unpacked offsets, but only as nxv2i32 13660 // vectors. 
These are implicitly sign (sxtw) or zero (zxtw) extend to 13661 // nxv2i64. Legalize accordingly. 13662 if (!OnlyPackedOffsets && 13663 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) 13664 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); 13665 13666 // Return value type that is representable in hardware 13667 EVT HwRetVt = getSVEContainerType(RetVT); 13668 13669 // Keep the original output value type around - this is needed to be able to 13670 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP 13671 // values we want the integer equivalent, so just use HwRetVT. 13672 SDValue OutVT = DAG.getValueType(RetVT); 13673 if (RetVT.isFloatingPoint()) 13674 OutVT = DAG.getValueType(HwRetVt); 13675 13676 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); 13677 SDValue Ops[] = {N->getOperand(0), // Chain 13678 N->getOperand(2), // Pg 13679 Base, Offset, OutVT}; 13680 13681 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); 13682 SDValue LoadChain = SDValue(Load.getNode(), 1); 13683 13684 if (RetVT.isInteger() && (RetVT != HwRetVt)) 13685 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); 13686 13687 // If the original return value was FP, bitcast accordingly. Doing it here 13688 // means that we can avoid adding TableGen patterns for FPs. 13689 if (RetVT.isFloatingPoint()) 13690 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); 13691 13692 return DAG.getMergeValues({Load, LoadChain}, DL); 13693 } 13694 13695 static SDValue 13696 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 13697 SelectionDAG &DAG) { 13698 if (DCI.isBeforeLegalizeOps()) 13699 return SDValue(); 13700 13701 SDLoc DL(N); 13702 SDValue Src = N->getOperand(0); 13703 unsigned Opc = Src->getOpcode(); 13704 13705 // Sign extend of an unsigned unpack -> signed unpack 13706 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 13707 13708 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI 13709 : AArch64ISD::SUNPKLO; 13710 13711 // Push the sign extend to the operand of the unpack 13712 // This is necessary where, for example, the operand of the unpack 13713 // is another unpack: 13714 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) 13715 // -> 13716 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) 13717 // -> 13718 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) 13719 SDValue ExtOp = Src->getOperand(0); 13720 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT(); 13721 EVT EltTy = VT.getVectorElementType(); 13722 (void)EltTy; 13723 13724 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && 13725 "Sign extending from an invalid type"); 13726 13727 EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), 13728 VT.getVectorElementType(), 13729 VT.getVectorElementCount() * 2); 13730 13731 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), 13732 ExtOp, DAG.getValueType(ExtVT)); 13733 13734 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); 13735 } 13736 13737 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates 13738 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. 
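// In other words, the switch below turns, for example,
//   (sign_extend_inreg (GLD1_MERGE_ZERO ...), nxv4i8)
// into the corresponding sign-extending load (GLD1S_MERGE_ZERO), provided
// the extension type matches the memory VT recorded on the load (checked
// after the switch).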
13739 unsigned NewOpc; 13740 unsigned MemVTOpNum = 4; 13741 switch (Opc) { 13742 case AArch64ISD::LD1_MERGE_ZERO: 13743 NewOpc = AArch64ISD::LD1S_MERGE_ZERO; 13744 MemVTOpNum = 3; 13745 break; 13746 case AArch64ISD::LDNF1_MERGE_ZERO: 13747 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; 13748 MemVTOpNum = 3; 13749 break; 13750 case AArch64ISD::LDFF1_MERGE_ZERO: 13751 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; 13752 MemVTOpNum = 3; 13753 break; 13754 case AArch64ISD::GLD1_MERGE_ZERO: 13755 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; 13756 break; 13757 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 13758 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 13759 break; 13760 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 13761 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 13762 break; 13763 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 13764 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 13765 break; 13766 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 13767 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 13768 break; 13769 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 13770 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 13771 break; 13772 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 13773 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; 13774 break; 13775 case AArch64ISD::GLDFF1_MERGE_ZERO: 13776 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; 13777 break; 13778 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 13779 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; 13780 break; 13781 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 13782 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; 13783 break; 13784 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 13785 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; 13786 break; 13787 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 13788 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; 13789 break; 13790 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 13791 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; 13792 break; 13793 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 13794 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; 13795 break; 13796 case AArch64ISD::GLDNT1_MERGE_ZERO: 13797 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; 13798 break; 13799 default: 13800 return SDValue(); 13801 } 13802 13803 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); 13804 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); 13805 13806 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) 13807 return SDValue(); 13808 13809 EVT DstVT = N->getValueType(0); 13810 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); 13811 13812 SmallVector<SDValue, 5> Ops; 13813 for (unsigned I = 0; I < Src->getNumOperands(); ++I) 13814 Ops.push_back(Src->getOperand(I)); 13815 13816 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); 13817 DCI.CombineTo(N, ExtLoad); 13818 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1)); 13819 13820 // Return N so it doesn't get rechecked 13821 return SDValue(N, 0); 13822 } 13823 13824 /// Legalize the gather prefetch (scalar + vector addressing mode) when the 13825 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset 13826 /// != nxv2i32) do not need legalization. 13827 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { 13828 const unsigned OffsetPos = 4; 13829 SDValue Offset = N->getOperand(OffsetPos); 13830 13831 // Not an unpacked vector, bail out. 13832 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) 13833 return SDValue(); 13834 13835 // Extend the unpacked offset vector to 64-bit lanes. 
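// Note: an ANY_EXTEND is used (rather than a sign or zero extend) since the
// sxtw/uxtw prefetch forms extend the 32-bit offsets themselves, so the upper
// half of each widened lane should be ignored by the instruction.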
13836 SDLoc DL(N); 13837 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); 13838 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 13839 // Replace the offset operand with the 64-bit one. 13840 Ops[OffsetPos] = Offset; 13841 13842 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 13843 } 13844 13845 /// Combines a node carrying the intrinsic 13846 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses 13847 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to 13848 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the 13849 /// sve gather prefetch instruction with vector plus immediate addressing mode. 13850 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, 13851 unsigned ScalarSizeInBytes) { 13852 const unsigned ImmPos = 4, OffsetPos = 3; 13853 // No need to combine the node if the immediate is valid... 13854 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) 13855 return SDValue(); 13856 13857 // ...otherwise swap the offset base with the offset... 13858 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); 13859 std::swap(Ops[ImmPos], Ops[OffsetPos]); 13860 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to 13861 // `aarch64_sve_prfb_gather_uxtw_index`. 13862 SDLoc DL(N); 13863 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, 13864 MVT::i64); 13865 13866 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); 13867 } 13868 13869 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 13870 DAGCombinerInfo &DCI) const { 13871 SelectionDAG &DAG = DCI.DAG; 13872 switch (N->getOpcode()) { 13873 default: 13874 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); 13875 break; 13876 case ISD::ADD: 13877 case ISD::SUB: 13878 return performAddSubLongCombine(N, DCI, DAG); 13879 case ISD::XOR: 13880 return performXorCombine(N, DAG, DCI, Subtarget); 13881 case ISD::MUL: 13882 return performMulCombine(N, DAG, DCI, Subtarget); 13883 case ISD::SINT_TO_FP: 13884 case ISD::UINT_TO_FP: 13885 return performIntToFpCombine(N, DAG, Subtarget); 13886 case ISD::FP_TO_SINT: 13887 case ISD::FP_TO_UINT: 13888 return performFpToIntCombine(N, DAG, DCI, Subtarget); 13889 case ISD::FDIV: 13890 return performFDivCombine(N, DAG, DCI, Subtarget); 13891 case ISD::OR: 13892 return performORCombine(N, DCI, Subtarget); 13893 case ISD::AND: 13894 return performANDCombine(N, DCI); 13895 case ISD::SRL: 13896 return performSRLCombine(N, DCI); 13897 case ISD::INTRINSIC_WO_CHAIN: 13898 return performIntrinsicCombine(N, DCI, Subtarget); 13899 case ISD::ANY_EXTEND: 13900 case ISD::ZERO_EXTEND: 13901 case ISD::SIGN_EXTEND: 13902 return performExtendCombine(N, DCI, DAG); 13903 case ISD::SIGN_EXTEND_INREG: 13904 return performSignExtendInRegCombine(N, DCI, DAG); 13905 case ISD::CONCAT_VECTORS: 13906 return performConcatVectorsCombine(N, DCI, DAG); 13907 case ISD::SELECT: 13908 return performSelectCombine(N, DCI); 13909 case ISD::VSELECT: 13910 return performVSelectCombine(N, DCI.DAG); 13911 case ISD::LOAD: 13912 if (performTBISimplification(N->getOperand(1), DCI, DAG)) 13913 return SDValue(N, 0); 13914 break; 13915 case ISD::STORE: 13916 return performSTORECombine(N, DCI, DAG, Subtarget); 13917 case AArch64ISD::BRCOND: 13918 return performBRCONDCombine(N, DCI, DAG); 13919 case AArch64ISD::TBNZ: 13920 case AArch64ISD::TBZ: 13921 return performTBZCombine(N, DCI, DAG); 13922 case AArch64ISD::CSEL: 13923 return 
performCONDCombine(N, DCI, DAG, 2, 3); 13924 case AArch64ISD::DUP: 13925 return performPostLD1Combine(N, DCI, false); 13926 case AArch64ISD::NVCAST: 13927 return performNVCASTCombine(N); 13928 case ISD::INSERT_VECTOR_ELT: 13929 return performPostLD1Combine(N, DCI, true); 13930 case ISD::INTRINSIC_VOID: 13931 case ISD::INTRINSIC_W_CHAIN: 13932 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 13933 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: 13934 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); 13935 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: 13936 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); 13937 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: 13938 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); 13939 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: 13940 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); 13941 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: 13942 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: 13943 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: 13944 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: 13945 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: 13946 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: 13947 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: 13948 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: 13949 return legalizeSVEGatherPrefetchOffsVec(N, DAG); 13950 case Intrinsic::aarch64_neon_ld2: 13951 case Intrinsic::aarch64_neon_ld3: 13952 case Intrinsic::aarch64_neon_ld4: 13953 case Intrinsic::aarch64_neon_ld1x2: 13954 case Intrinsic::aarch64_neon_ld1x3: 13955 case Intrinsic::aarch64_neon_ld1x4: 13956 case Intrinsic::aarch64_neon_ld2lane: 13957 case Intrinsic::aarch64_neon_ld3lane: 13958 case Intrinsic::aarch64_neon_ld4lane: 13959 case Intrinsic::aarch64_neon_ld2r: 13960 case Intrinsic::aarch64_neon_ld3r: 13961 case Intrinsic::aarch64_neon_ld4r: 13962 case Intrinsic::aarch64_neon_st2: 13963 case Intrinsic::aarch64_neon_st3: 13964 case Intrinsic::aarch64_neon_st4: 13965 case Intrinsic::aarch64_neon_st1x2: 13966 case Intrinsic::aarch64_neon_st1x3: 13967 case Intrinsic::aarch64_neon_st1x4: 13968 case Intrinsic::aarch64_neon_st2lane: 13969 case Intrinsic::aarch64_neon_st3lane: 13970 case Intrinsic::aarch64_neon_st4lane: 13971 return performNEONPostLDSTCombine(N, DCI, DAG); 13972 case Intrinsic::aarch64_sve_ldnt1: 13973 return performLDNT1Combine(N, DAG); 13974 case Intrinsic::aarch64_sve_ld1rq: 13975 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); 13976 case Intrinsic::aarch64_sve_ld1ro: 13977 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); 13978 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 13979 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 13980 case Intrinsic::aarch64_sve_ldnt1_gather: 13981 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 13982 case Intrinsic::aarch64_sve_ldnt1_gather_index: 13983 return performGatherLoadCombine(N, DAG, 13984 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); 13985 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 13986 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 13987 case Intrinsic::aarch64_sve_ld1: 13988 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); 13989 case Intrinsic::aarch64_sve_ldnf1: 13990 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); 13991 case Intrinsic::aarch64_sve_ldff1: 
13992 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); 13993 case Intrinsic::aarch64_sve_st1: 13994 return performST1Combine(N, DAG); 13995 case Intrinsic::aarch64_sve_stnt1: 13996 return performSTNT1Combine(N, DAG); 13997 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 13998 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 13999 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 14000 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 14001 case Intrinsic::aarch64_sve_stnt1_scatter: 14002 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 14003 case Intrinsic::aarch64_sve_stnt1_scatter_index: 14004 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); 14005 case Intrinsic::aarch64_sve_ld1_gather: 14006 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); 14007 case Intrinsic::aarch64_sve_ld1_gather_index: 14008 return performGatherLoadCombine(N, DAG, 14009 AArch64ISD::GLD1_SCALED_MERGE_ZERO); 14010 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 14011 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, 14012 /*OnlyPackedOffsets=*/false); 14013 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 14014 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, 14015 /*OnlyPackedOffsets=*/false); 14016 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 14017 return performGatherLoadCombine(N, DAG, 14018 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, 14019 /*OnlyPackedOffsets=*/false); 14020 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 14021 return performGatherLoadCombine(N, DAG, 14022 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, 14023 /*OnlyPackedOffsets=*/false); 14024 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 14025 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); 14026 case Intrinsic::aarch64_sve_ldff1_gather: 14027 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); 14028 case Intrinsic::aarch64_sve_ldff1_gather_index: 14029 return performGatherLoadCombine(N, DAG, 14030 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); 14031 case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 14032 return performGatherLoadCombine(N, DAG, 14033 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, 14034 /*OnlyPackedOffsets=*/false); 14035 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 14036 return performGatherLoadCombine(N, DAG, 14037 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, 14038 /*OnlyPackedOffsets=*/false); 14039 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 14040 return performGatherLoadCombine(N, DAG, 14041 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, 14042 /*OnlyPackedOffsets=*/false); 14043 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 14044 return performGatherLoadCombine(N, DAG, 14045 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, 14046 /*OnlyPackedOffsets=*/false); 14047 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 14048 return performGatherLoadCombine(N, DAG, 14049 AArch64ISD::GLDFF1_IMM_MERGE_ZERO); 14050 case Intrinsic::aarch64_sve_st1_scatter: 14051 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); 14052 case Intrinsic::aarch64_sve_st1_scatter_index: 14053 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); 14054 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 14055 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, 14056 /*OnlyPackedOffsets=*/false); 14057 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 14058 return performScatterStoreCombine(N, DAG, 
AArch64ISD::SST1_UXTW_PRED, 14059 /*OnlyPackedOffsets=*/false); 14060 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 14061 return performScatterStoreCombine(N, DAG, 14062 AArch64ISD::SST1_SXTW_SCALED_PRED, 14063 /*OnlyPackedOffsets=*/false); 14064 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 14065 return performScatterStoreCombine(N, DAG, 14066 AArch64ISD::SST1_UXTW_SCALED_PRED, 14067 /*OnlyPackedOffsets=*/false); 14068 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 14069 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); 14070 case Intrinsic::aarch64_sve_tuple_get: { 14071 SDLoc DL(N); 14072 SDValue Chain = N->getOperand(0); 14073 SDValue Src1 = N->getOperand(2); 14074 SDValue Idx = N->getOperand(3); 14075 14076 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); 14077 EVT ResVT = N->getValueType(0); 14078 uint64_t NumLanes = ResVT.getVectorElementCount().Min; 14079 SDValue Val = 14080 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, 14081 DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32)); 14082 return DAG.getMergeValues({Val, Chain}, DL); 14083 } 14084 case Intrinsic::aarch64_sve_tuple_set: { 14085 SDLoc DL(N); 14086 SDValue Chain = N->getOperand(0); 14087 SDValue Tuple = N->getOperand(2); 14088 SDValue Idx = N->getOperand(3); 14089 SDValue Vec = N->getOperand(4); 14090 14091 EVT TupleVT = Tuple.getValueType(); 14092 uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; 14093 14094 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); 14095 uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; 14096 14097 if ((TupleLanes % NumLanes) != 0) 14098 report_fatal_error("invalid tuple vector!"); 14099 14100 uint64_t NumVecs = TupleLanes / NumLanes; 14101 14102 SmallVector<SDValue, 4> Opnds; 14103 for (unsigned I = 0; I < NumVecs; ++I) { 14104 if (I == IdxConst) 14105 Opnds.push_back(Vec); 14106 else { 14107 Opnds.push_back( 14108 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple, 14109 DAG.getConstant(I * NumLanes, DL, MVT::i32))); 14110 } 14111 } 14112 SDValue Concat = 14113 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); 14114 return DAG.getMergeValues({Concat, Chain}, DL); 14115 } 14116 case Intrinsic::aarch64_sve_tuple_create2: 14117 case Intrinsic::aarch64_sve_tuple_create3: 14118 case Intrinsic::aarch64_sve_tuple_create4: { 14119 SDLoc DL(N); 14120 SDValue Chain = N->getOperand(0); 14121 14122 SmallVector<SDValue, 4> Opnds; 14123 for (unsigned I = 2; I < N->getNumOperands(); ++I) 14124 Opnds.push_back(N->getOperand(I)); 14125 14126 EVT VT = Opnds[0].getValueType(); 14127 EVT EltVT = VT.getVectorElementType(); 14128 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 14129 VT.getVectorElementCount() * 14130 (N->getNumOperands() - 2)); 14131 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); 14132 return DAG.getMergeValues({Concat, Chain}, DL); 14133 } 14134 case Intrinsic::aarch64_sve_ld2: 14135 case Intrinsic::aarch64_sve_ld3: 14136 case Intrinsic::aarch64_sve_ld4: { 14137 SDLoc DL(N); 14138 SDValue Chain = N->getOperand(0); 14139 SDValue Mask = N->getOperand(2); 14140 SDValue BasePtr = N->getOperand(3); 14141 SDValue LoadOps[] = {Chain, Mask, BasePtr}; 14142 unsigned IntrinsicID = 14143 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 14144 SDValue Result = 14145 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL); 14146 return DAG.getMergeValues({Result, Chain}, DL); 14147 } 14148 default: 14149 break; 14150 } 14151 break; 
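// Fold (globaladdr + constant) addressing into the global's offset where the
// code-model and object-size checks allow it; see performGlobalAddressCombine
// above for the details.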
14152 case ISD::GlobalAddress: 14153 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); 14154 } 14155 return SDValue(); 14156 } 14157 14158 // Check if the return value is used as only a return value, as otherwise 14159 // we can't perform a tail-call. In particular, we need to check for 14160 // target ISD nodes that are returns and any other "odd" constructs 14161 // that the generic analysis code won't necessarily catch. 14162 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 14163 SDValue &Chain) const { 14164 if (N->getNumValues() != 1) 14165 return false; 14166 if (!N->hasNUsesOfValue(1, 0)) 14167 return false; 14168 14169 SDValue TCChain = Chain; 14170 SDNode *Copy = *N->use_begin(); 14171 if (Copy->getOpcode() == ISD::CopyToReg) { 14172 // If the copy has a glue operand, we conservatively assume it isn't safe to 14173 // perform a tail call. 14174 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 14175 MVT::Glue) 14176 return false; 14177 TCChain = Copy->getOperand(0); 14178 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 14179 return false; 14180 14181 bool HasRet = false; 14182 for (SDNode *Node : Copy->uses()) { 14183 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 14184 return false; 14185 HasRet = true; 14186 } 14187 14188 if (!HasRet) 14189 return false; 14190 14191 Chain = TCChain; 14192 return true; 14193 } 14194 14195 // Return whether the an instruction can potentially be optimized to a tail 14196 // call. This will cause the optimizers to attempt to move, or duplicate, 14197 // return instructions to help enable tail call optimizations for this 14198 // instruction. 14199 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 14200 return CI->isTailCall(); 14201 } 14202 14203 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 14204 SDValue &Offset, 14205 ISD::MemIndexedMode &AM, 14206 bool &IsInc, 14207 SelectionDAG &DAG) const { 14208 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 14209 return false; 14210 14211 Base = Op->getOperand(0); 14212 // All of the indexed addressing mode instructions take a signed 14213 // 9 bit immediate offset. 14214 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 14215 int64_t RHSC = RHS->getSExtValue(); 14216 if (Op->getOpcode() == ISD::SUB) 14217 RHSC = -(uint64_t)RHSC; 14218 if (!isInt<9>(RHSC)) 14219 return false; 14220 IsInc = (Op->getOpcode() == ISD::ADD); 14221 Offset = Op->getOperand(1); 14222 return true; 14223 } 14224 return false; 14225 } 14226 14227 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 14228 SDValue &Offset, 14229 ISD::MemIndexedMode &AM, 14230 SelectionDAG &DAG) const { 14231 EVT VT; 14232 SDValue Ptr; 14233 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 14234 VT = LD->getMemoryVT(); 14235 Ptr = LD->getBasePtr(); 14236 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 14237 VT = ST->getMemoryVT(); 14238 Ptr = ST->getBasePtr(); 14239 } else 14240 return false; 14241 14242 bool IsInc; 14243 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 14244 return false; 14245 AM = IsInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 14246 return true; 14247 } 14248 14249 bool AArch64TargetLowering::getPostIndexedAddressParts( 14250 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 14251 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 14252 EVT VT; 14253 SDValue Ptr; 14254 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 14255 VT = LD->getMemoryVT(); 14256 Ptr = LD->getBasePtr(); 14257 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 14258 VT = ST->getMemoryVT(); 14259 Ptr = ST->getBasePtr(); 14260 } else 14261 return false; 14262 14263 bool IsInc; 14264 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 14265 return false; 14266 // Post-indexing updates the base, so it's not a valid transform 14267 // if that's not the same as the load's pointer. 14268 if (Ptr != Base) 14269 return false; 14270 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; 14271 return true; 14272 } 14273 14274 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, 14275 SelectionDAG &DAG) { 14276 SDLoc DL(N); 14277 SDValue Op = N->getOperand(0); 14278 14279 if (N->getValueType(0) != MVT::i16 || 14280 (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16)) 14281 return; 14282 14283 Op = SDValue( 14284 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 14285 DAG.getUNDEF(MVT::i32), Op, 14286 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 14287 0); 14288 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 14289 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 14290 } 14291 14292 static void ReplaceReductionResults(SDNode *N, 14293 SmallVectorImpl<SDValue> &Results, 14294 SelectionDAG &DAG, unsigned InterOp, 14295 unsigned AcrossOp) { 14296 EVT LoVT, HiVT; 14297 SDValue Lo, Hi; 14298 SDLoc dl(N); 14299 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 14300 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 14301 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 14302 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 14303 Results.push_back(SplitVal); 14304 } 14305 14306 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { 14307 SDLoc DL(N); 14308 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); 14309 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, 14310 DAG.getNode(ISD::SRL, DL, MVT::i128, N, 14311 DAG.getConstant(64, DL, MVT::i64))); 14312 return std::make_pair(Lo, Hi); 14313 } 14314 14315 void AArch64TargetLowering::ReplaceExtractSubVectorResults( 14316 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 14317 SDValue In = N->getOperand(0); 14318 EVT InVT = In.getValueType(); 14319 14320 // Common code will handle these just fine. 14321 if (!InVT.isScalableVector() || !InVT.isInteger()) 14322 return; 14323 14324 SDLoc DL(N); 14325 EVT VT = N->getValueType(0); 14326 14327 // The following checks bail if this is not a halving operation. 14328 14329 ElementCount ResEC = VT.getVectorElementCount(); 14330 14331 if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) 14332 return; 14333 14334 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14335 if (!CIndex) 14336 return; 14337 14338 unsigned Index = CIndex->getZExtValue(); 14339 if ((Index != 0) && (Index != ResEC.Min)) 14340 return; 14341 14342 unsigned Opcode = (Index == 0) ? 
AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; 14343 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); 14344 14345 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); 14346 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); 14347 } 14348 14349 // Create an even/odd pair of X registers holding integer value V. 14350 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 14351 SDLoc dl(V.getNode()); 14352 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); 14353 SDValue VHi = DAG.getAnyExtOrTrunc( 14354 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), 14355 dl, MVT::i64); 14356 if (DAG.getDataLayout().isBigEndian()) 14357 std::swap (VLo, VHi); 14358 SDValue RegClass = 14359 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); 14360 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); 14361 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); 14362 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 14363 return SDValue( 14364 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 14365 } 14366 14367 static void ReplaceCMP_SWAP_128Results(SDNode *N, 14368 SmallVectorImpl<SDValue> &Results, 14369 SelectionDAG &DAG, 14370 const AArch64Subtarget *Subtarget) { 14371 assert(N->getValueType(0) == MVT::i128 && 14372 "AtomicCmpSwap on types less than 128 should be legal"); 14373 14374 if (Subtarget->hasLSE()) { 14375 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, 14376 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. 14377 SDValue Ops[] = { 14378 createGPRPairNode(DAG, N->getOperand(2)), // Compare value 14379 createGPRPairNode(DAG, N->getOperand(3)), // Store value 14380 N->getOperand(1), // Ptr 14381 N->getOperand(0), // Chain in 14382 }; 14383 14384 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 14385 14386 unsigned Opcode; 14387 switch (MemOp->getOrdering()) { 14388 case AtomicOrdering::Monotonic: 14389 Opcode = AArch64::CASPX; 14390 break; 14391 case AtomicOrdering::Acquire: 14392 Opcode = AArch64::CASPAX; 14393 break; 14394 case AtomicOrdering::Release: 14395 Opcode = AArch64::CASPLX; 14396 break; 14397 case AtomicOrdering::AcquireRelease: 14398 case AtomicOrdering::SequentiallyConsistent: 14399 Opcode = AArch64::CASPALX; 14400 break; 14401 default: 14402 llvm_unreachable("Unexpected ordering!"); 14403 } 14404 14405 MachineSDNode *CmpSwap = DAG.getMachineNode( 14406 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); 14407 DAG.setNodeMemRefs(CmpSwap, {MemOp}); 14408 14409 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; 14410 if (DAG.getDataLayout().isBigEndian()) 14411 std::swap(SubReg1, SubReg2); 14412 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, 14413 SDValue(CmpSwap, 0)); 14414 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, 14415 SDValue(CmpSwap, 0)); 14416 Results.push_back( 14417 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); 14418 Results.push_back(SDValue(CmpSwap, 1)); // Chain out 14419 return; 14420 } 14421 14422 auto Desired = splitInt128(N->getOperand(2), DAG); 14423 auto New = splitInt128(N->getOperand(3), DAG); 14424 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, 14425 New.first, New.second, N->getOperand(0)}; 14426 SDNode *CmpSwap = DAG.getMachineNode( 14427 AArch64::CMP_SWAP_128, SDLoc(N), 14428 DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, 
MVT::Other), Ops); 14429 14430 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 14431 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 14432 14433 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 14434 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); 14435 Results.push_back(SDValue(CmpSwap, 3)); 14436 } 14437 14438 void AArch64TargetLowering::ReplaceNodeResults( 14439 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 14440 switch (N->getOpcode()) { 14441 default: 14442 llvm_unreachable("Don't know how to custom expand this"); 14443 case ISD::BITCAST: 14444 ReplaceBITCASTResults(N, Results, DAG); 14445 return; 14446 case ISD::VECREDUCE_ADD: 14447 case ISD::VECREDUCE_SMAX: 14448 case ISD::VECREDUCE_SMIN: 14449 case ISD::VECREDUCE_UMAX: 14450 case ISD::VECREDUCE_UMIN: 14451 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); 14452 return; 14453 14454 case ISD::CTPOP: 14455 Results.push_back(LowerCTPOP(SDValue(N, 0), DAG)); 14456 return; 14457 case AArch64ISD::SADDV: 14458 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 14459 return; 14460 case AArch64ISD::UADDV: 14461 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 14462 return; 14463 case AArch64ISD::SMINV: 14464 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); 14465 return; 14466 case AArch64ISD::UMINV: 14467 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 14468 return; 14469 case AArch64ISD::SMAXV: 14470 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 14471 return; 14472 case AArch64ISD::UMAXV: 14473 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 14474 return; 14475 case ISD::FP_TO_UINT: 14476 case ISD::FP_TO_SINT: 14477 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 14478 // Let normal code take care of it by not adding anything to Results. 14479 return; 14480 case ISD::ATOMIC_CMP_SWAP: 14481 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); 14482 return; 14483 case ISD::LOAD: { 14484 assert(SDValue(N, 0).getValueType() == MVT::i128 && 14485 "unexpected load's value type"); 14486 LoadSDNode *LoadNode = cast<LoadSDNode>(N); 14487 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) { 14488 // Non-volatile loads are optimized later in AArch64's load/store 14489 // optimizer. 
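// They are left as a plain i128 LOAD at this point; only the volatile case
// below is expanded here into an LDP producing a pair of i64 results.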
14490 return; 14491 } 14492 14493 SDValue Result = DAG.getMemIntrinsicNode( 14494 AArch64ISD::LDP, SDLoc(N), 14495 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), 14496 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), 14497 LoadNode->getMemOperand()); 14498 14499 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, 14500 Result.getValue(0), Result.getValue(1)); 14501 Results.append({Pair, Result.getValue(2) /* Chain */}); 14502 return; 14503 } 14504 case ISD::EXTRACT_SUBVECTOR: 14505 ReplaceExtractSubVectorResults(N, Results, DAG); 14506 return; 14507 case ISD::INTRINSIC_WO_CHAIN: { 14508 EVT VT = N->getValueType(0); 14509 assert((VT == MVT::i8 || VT == MVT::i16) && 14510 "custom lowering for unexpected type"); 14511 14512 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0)); 14513 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 14514 switch (IntID) { 14515 default: 14516 return; 14517 case Intrinsic::aarch64_sve_clasta_n: { 14518 SDLoc DL(N); 14519 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 14520 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, 14521 N->getOperand(1), Op2, N->getOperand(3)); 14522 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 14523 return; 14524 } 14525 case Intrinsic::aarch64_sve_clastb_n: { 14526 SDLoc DL(N); 14527 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); 14528 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, 14529 N->getOperand(1), Op2, N->getOperand(3)); 14530 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 14531 return; 14532 } 14533 case Intrinsic::aarch64_sve_lasta: { 14534 SDLoc DL(N); 14535 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, 14536 N->getOperand(1), N->getOperand(2)); 14537 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 14538 return; 14539 } 14540 case Intrinsic::aarch64_sve_lastb: { 14541 SDLoc DL(N); 14542 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, 14543 N->getOperand(1), N->getOperand(2)); 14544 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); 14545 return; 14546 } 14547 } 14548 } 14549 } 14550 } 14551 14552 bool AArch64TargetLowering::useLoadStackGuardNode() const { 14553 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) 14554 return TargetLowering::useLoadStackGuardNode(); 14555 return true; 14556 } 14557 14558 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 14559 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 14560 // reciprocal if there are three or more FDIVs. 14561 return 3; 14562 } 14563 14564 TargetLoweringBase::LegalizeTypeAction 14565 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { 14566 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 14567 // v4i16, v2i32 instead of to promote. 14568 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || 14569 VT == MVT::v1f32) 14570 return TypeWidenVector; 14571 14572 return TargetLoweringBase::getPreferredVectorAction(VT); 14573 } 14574 14575 // Loads and stores less than 128-bits are already atomic; ones above that 14576 // are doomed anyway, so defer to the default libcall and blame the OS when 14577 // things go wrong. 
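// Put differently: aligned accesses up to 64 bits are single-copy atomic on
// AArch64, so only the 128-bit case is handed to the atomic expansion pass by
// the hooks below.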
14578 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 14579 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 14580 return Size == 128; 14581 } 14582 14583 // Loads and stores less than 128-bits are already atomic; ones above that 14584 // are doomed anyway, so defer to the default libcall and blame the OS when 14585 // things go wrong. 14586 TargetLowering::AtomicExpansionKind 14587 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 14588 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 14589 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 14590 } 14591 14592 // For the real atomic operations, we have ldxr/stxr up to 128 bits, 14593 TargetLowering::AtomicExpansionKind 14594 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 14595 if (AI->isFloatingPointOperation()) 14596 return AtomicExpansionKind::CmpXChg; 14597 14598 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 14599 if (Size > 128) return AtomicExpansionKind::None; 14600 // Nand not supported in LSE. 14601 if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; 14602 // Leave 128 bits to LLSC. 14603 return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; 14604 } 14605 14606 TargetLowering::AtomicExpansionKind 14607 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 14608 AtomicCmpXchgInst *AI) const { 14609 // If subtarget has LSE, leave cmpxchg intact for codegen. 14610 if (Subtarget->hasLSE()) 14611 return AtomicExpansionKind::None; 14612 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 14613 // implement cmpxchg without spilling. If the address being exchanged is also 14614 // on the stack and close enough to the spill slot, this can lead to a 14615 // situation where the monitor always gets cleared and the atomic operation 14616 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 14617 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 14618 return AtomicExpansionKind::None; 14619 return AtomicExpansionKind::LLSC; 14620 } 14621 14622 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 14623 AtomicOrdering Ord) const { 14624 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14625 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 14626 bool IsAcquire = isAcquireOrStronger(Ord); 14627 14628 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 14629 // intrinsic must return {i64, i64} and we have to recombine them into a 14630 // single i128 here. 14631 if (ValTy->getPrimitiveSizeInBits() == 128) { 14632 Intrinsic::ID Int = 14633 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 14634 Function *Ldxr = Intrinsic::getDeclaration(M, Int); 14635 14636 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 14637 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 14638 14639 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 14640 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 14641 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 14642 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 14643 return Builder.CreateOr( 14644 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); 14645 } 14646 14647 Type *Tys[] = { Addr->getType() }; 14648 Intrinsic::ID Int = 14649 IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 14650 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); 14651 14652 Type *EltTy = cast<PointerType>(Addr->getType())->getElementType(); 14653 14654 const DataLayout &DL = M->getDataLayout(); 14655 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy)); 14656 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); 14657 14658 return Builder.CreateBitCast(Trunc, EltTy); 14659 } 14660 14661 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 14662 IRBuilder<> &Builder) const { 14663 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14664 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 14665 } 14666 14667 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, 14668 Value *Val, Value *Addr, 14669 AtomicOrdering Ord) const { 14670 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14671 bool IsRelease = isReleaseOrStronger(Ord); 14672 14673 // Since the intrinsics must have legal type, the i128 intrinsics take two 14674 // parameters: "i64, i64". We must marshal Val into the appropriate form 14675 // before the call. 14676 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 14677 Intrinsic::ID Int = 14678 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 14679 Function *Stxr = Intrinsic::getDeclaration(M, Int); 14680 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 14681 14682 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 14683 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 14684 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 14685 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 14686 } 14687 14688 Intrinsic::ID Int = 14689 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 14690 Type *Tys[] = { Addr->getType() }; 14691 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 14692 14693 const DataLayout &DL = M->getDataLayout(); 14694 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); 14695 Val = Builder.CreateBitCast(Val, IntValTy); 14696 14697 return Builder.CreateCall(Stxr, 14698 {Builder.CreateZExtOrBitCast( 14699 Val, Stxr->getFunctionType()->getParamType(0)), 14700 Addr}); 14701 } 14702 14703 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 14704 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 14705 return Ty->isArrayTy(); 14706 } 14707 14708 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 14709 EVT) const { 14710 return false; 14711 } 14712 14713 static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { 14714 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 14715 Function *ThreadPointerFunc = 14716 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); 14717 return IRB.CreatePointerCast( 14718 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 14719 Offset), 14720 IRB.getInt8PtrTy()->getPointerTo(0)); 14721 } 14722 14723 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { 14724 // Android provides a fixed TLS slot for the stack cookie. See the definition 14725 // of TLS_SLOT_STACK_GUARD in 14726 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 14727 if (Subtarget->isTargetAndroid()) 14728 return UseTlsOffset(IRB, 0x28); 14729 14730 // Fuchsia is similar. 14731 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x10);

  return TargetLowering::getIRStackGuard(IRB);
}

void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        Type::getInt8PtrTy(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::Win64);
      F->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }
  TargetLowering::insertSSPDeclarations(M);
}

Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x8);

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // the and/cmp/br is then likely to fold into a single tbz instruction. It
  // may be beneficial to sink in other cases, but we would have to check that
  // the cmp would not get folded into the br to form a cbz for these to be
  // beneficial.
  ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}

bool AArch64TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // Else, if this is a vector shift, prefer 'shl'.
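  // That is, the fold is always allowed for scalar integers and, for vectors,
  // only when the replacement shift is a left shift.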
  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}

bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
                                              SDNode *N) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
    return false;
  return true;
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since AArch64 doesn't have
  // vector integer division, leaving the division as-is is a loss even in
  // terms of size, because it will have to be scalarized, while the
  // alternative code sequence can be performed in vector form.
  bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
  return OptSize && !VT.isVector();
}

bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  // We want inc-of-add for scalars and sub-of-not for vectors.
  return VT.isScalarInteger();
}

bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}

unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
    return getPointerTy(DL).getSizeInBits();

  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}

void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}

// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
  return false;
}

bool AArch64TargetLowering::shouldLocalize(
    const MachineInstr &MI, const TargetTransformInfo *TTI) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
    const GlobalValue &GV = *MI.getOperand(1).getGlobal();
    if (GV.isThreadLocal() && Subtarget->isTargetMachO())
      return false;
    break;
  }
  // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
  // localizable.
  case AArch64::ADRP:
  case AArch64::G_ADD_LOW:
    return true;
  default:
    break;
  }
  return TargetLoweringBase::shouldLocalize(MI, TTI);
}

bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
  if (isa<ScalableVectorType>(Inst.getType()))
    return true;

  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
    if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
      return true;

  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
    if (isa<ScalableVectorType>(AI->getAllocatedType()))
      return true;
  }

  return false;
}

// Return the largest legal scalable vector type that matches VT's element
// type.
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
  assert(VT.isFixedLengthVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal fixed length vector!");
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE container");
  case MVT::i8:
    return EVT(MVT::nxv16i8);
  case MVT::i16:
    return EVT(MVT::nxv8i16);
  case MVT::i32:
    return EVT(MVT::nxv4i32);
  case MVT::i64:
    return EVT(MVT::nxv2i64);
  case MVT::f16:
    return EVT(MVT::nxv8f16);
  case MVT::f32:
    return EVT(MVT::nxv4f32);
  case MVT::f64:
    return EVT(MVT::nxv2f64);
  }
}

// Return a PTRUE with active lanes corresponding to the extent of VT.
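// For example, a v8i16 VT yields an nxv8i1 PTRUE using pattern vl8, i.e. with
// only the first eight predicate lanes active.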
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
                                                EVT VT) {
  assert(VT.isFixedLengthVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal fixed length vector!");

  int PgPattern;
  switch (VT.getVectorNumElements()) {
  default:
    llvm_unreachable("unexpected element count for SVE predicate");
  case 1:
    PgPattern = AArch64SVEPredPattern::vl1;
    break;
  case 2:
    PgPattern = AArch64SVEPredPattern::vl2;
    break;
  case 4:
    PgPattern = AArch64SVEPredPattern::vl4;
    break;
  case 8:
    PgPattern = AArch64SVEPredPattern::vl8;
    break;
  case 16:
    PgPattern = AArch64SVEPredPattern::vl16;
    break;
  case 32:
    PgPattern = AArch64SVEPredPattern::vl32;
    break;
  case 64:
    PgPattern = AArch64SVEPredPattern::vl64;
    break;
  case 128:
    PgPattern = AArch64SVEPredPattern::vl128;
    break;
  case 256:
    PgPattern = AArch64SVEPredPattern::vl256;
    break;
  }

  // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
  // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
  // variants of instructions when available.

  MVT MaskVT;
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE predicate");
  case MVT::i8:
    MaskVT = MVT::nxv16i1;
    break;
  case MVT::i16:
  case MVT::f16:
    MaskVT = MVT::nxv8i1;
    break;
  case MVT::i32:
  case MVT::f32:
    MaskVT = MVT::nxv4i1;
    break;
  case MVT::i64:
  case MVT::f64:
    MaskVT = MVT::nxv2i1;
    break;
  }

  return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
                     DAG.getTargetConstant(PgPattern, DL, MVT::i64));
}

static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
                                             EVT VT) {
  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal scalable vector!");
  auto PredTy = VT.changeVectorElementType(MVT::i1);
  return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
}

static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
  if (VT.isFixedLengthVector())
    return getPredicateForFixedLengthVector(DAG, DL, VT);

  return getPredicateForScalableVector(DAG, DL, VT);
}

// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isScalableVector() &&
         "Expected to convert into a scalable vector!");
  assert(V.getValueType().isFixedLengthVector() &&
         "Expected a fixed length vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}

// Shrink V so it's just big enough to maintain a VT's worth of data.
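// This is the inverse of convertToScalableVector: the fixed length value lives
// in the low lanes of the scalable container and is recovered with an
// EXTRACT_SUBVECTOR at index zero.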
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isFixedLengthVector() &&
         "Expected to convert into a fixed length vector!");
  assert(V.getValueType().isScalableVector() &&
         "Expected a scalable vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}

// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Load = cast<LoadSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  auto NewLoad = DAG.getMaskedLoad(
      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
      Load->getExtensionType());

  auto Result = convertFromScalableVector(DAG, VT, NewLoad);
  SDValue MergedValues[2] = {Result, Load->getChain()};
  return DAG.getMergeValues(MergedValues, DL);
}

// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Store = cast<StoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  return DAG.getMaskedStore(
      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
      getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
      Store->getMemOperand(), Store->getAddressingMode(),
      Store->isTruncatingStore());
}

SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // Repeatedly truncate Val until the result is of the desired element type.
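  // Each step bitcasts to the next narrower element type and uses UZP1 to keep
  // the even-indexed narrow elements, e.g. nxv2i64 -> nxv4i32 -> nxv8i16 ->
  // nxv16i8 when VT's element type is i8.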
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv2i64:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv4i32:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv8i16:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}

SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  auto Pg = getPredicateForVector(DAG, DL, VT);

  if (useSVEForFixedLengthVectorVT(VT)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    // Create the list of operands by converting existing ones to scalable
    // types.
    SmallVector<SDValue, 4> Operands = {Pg};
    for (const SDValue &V : Op->op_values()) {
      if (isa<CondCodeSDNode>(V)) {
        Operands.push_back(V);
        continue;
      }

      assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
             "Only fixed length vectors are supported!");
      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
    }

    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
    return convertFromScalableVector(DAG, VT, ScalableRes);
  }

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  SmallVector<SDValue, 4> Operands = {Pg};
  for (const SDValue &V : Op->op_values()) {
    assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
           "Only scalable vectors are supported!");
    Operands.push_back(V);
  }

  return DAG.getNode(NewOp, DL, VT, Operands);
}